├── setup
│   ├── .gitkeep
│   ├── lambda-custom-resource
│   │   └── prepare_dev_package_cr.py
│   └── template.yaml
├── src
│   ├── cloud
│   │   ├── pipelines
│   │   │   ├── semantic_segmentation
│   │   │   │   ├── requirements.txt
│   │   │   │   ├── __init__.py
│   │   │   │   ├── train_tf.py
│   │   │   │   ├── pipeline.py
│   │   │   │   └── preprocessing.py
│   │   │   ├── __init__.py
│   │   │   ├── image_classification
│   │   │   │   ├── __init__.py
│   │   │   │   ├── evaluation.py
│   │   │   │   ├── preprocessing.py
│   │   │   │   └── pipeline.py
│   │   │   ├── __version__.py
│   │   │   ├── _utils.py
│   │   │   ├── get_pipeline_definition.py
│   │   │   └── run_pipeline.py
│   │   ├── data_preparation.ipynb
│   │   ├── semantic_segmentation_pipeline.ipynb
│   │   └── image_classification_pipeline.ipynb
│   └── edge
│       ├── models_config.json
│       ├── requirements.txt
│       ├── app
│       │   ├── __init__.py
│       │   ├── util.py
│       │   ├── logger.py
│       │   ├── edgeagentclient.py
│       │   └── ota.py
│       ├── start_edge_agent.sh
│       ├── templates
│       │   ├── main_noimg.html
│       │   ├── base.html
│       │   └── main.html
│       ├── install.py
│       └── run.py
├── img
│   ├── iot_job.png
│   ├── pipeline.png
│   ├── architecture.png
│   ├── edge_config.png
│   ├── inferece_ui.png
│   ├── cloudformation.png
│   └── kolektor_sdd2.png
├── .gitignore
├── CODE_OF_CONDUCT.md
├── LICENSE
├── CONTRIBUTING.md
└── README.md
/setup/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/cloud/pipelines/semantic_segmentation/requirements.txt:
--------------------------------------------------------------------------------
1 | opencv-python
--------------------------------------------------------------------------------
/img/iot_job.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-edge-defect-detection-computer-vision/HEAD/img/iot_job.png
--------------------------------------------------------------------------------
/img/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-edge-defect-detection-computer-vision/HEAD/img/pipeline.png
--------------------------------------------------------------------------------
/src/cloud/pipelines/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
--------------------------------------------------------------------------------
/img/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-edge-defect-detection-computer-vision/HEAD/img/architecture.png
--------------------------------------------------------------------------------
/img/edge_config.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-edge-defect-detection-computer-vision/HEAD/img/edge_config.png
--------------------------------------------------------------------------------
/img/inferece_ui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-edge-defect-detection-computer-vision/HEAD/img/inferece_ui.png
--------------------------------------------------------------------------------
/img/cloudformation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-edge-defect-detection-computer-vision/HEAD/img/cloudformation.png
--------------------------------------------------------------------------------
/img/kolektor_sdd2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-edge-defect-detection-computer-vision/HEAD/img/kolektor_sdd2.png
--------------------------------------------------------------------------------
/src/cloud/pipelines/image_classification/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
--------------------------------------------------------------------------------
/src/cloud/pipelines/semantic_segmentation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | __pycache__/
3 | .ipynb_checkpoints
4 |
5 | # Ignore the data directory in git to not upload large files to repo
6 | src/cloud/data/
7 |
--------------------------------------------------------------------------------
/src/edge/models_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "mappings": {
3 | "image-classification-app": "img-classification",
4 | "image-segmentation-app": "unet"
5 | },
6 | "models": [
7 | ]
8 | }
--------------------------------------------------------------------------------
/src/edge/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.22.0
2 | Pillow==10.3.0
3 | sysv-ipc==1.1.0
4 | boto3==1.17.89
5 | grpcio-tools==1.38.0
6 | grpcio==1.53.2
7 | protobuf==3.18.3
8 | paho-mqtt==1.5.1
9 | waitress==3.0.1
10 | Flask==2.3.2
11 |
--------------------------------------------------------------------------------
/src/edge/app/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | sys.path.append(os.path.dirname(os.path.abspath(__file__)))
4 |
5 | from .edgeagentclient import EdgeAgentClient
6 | from .ota import OTAModelUpdate
7 | from .logger import Logger
8 | from .util import *
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 |
--------------------------------------------------------------------------------
/src/cloud/pipelines/__version__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 | """Metadata for the pipelines package."""
4 |
5 | __title__ = "sm-pipelines-defect-detection"
6 | __description__ = "Defect detection pipelines package"
7 | __version__ = "0.0.1"
8 | __author__ = "lichtend"
9 | __author_email__ = "lichtend@amazon.com"
10 | __license__ = "MIT"
11 | __url__ = "https://aws.amazon.com/"
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this
4 | software and associated documentation files (the "Software"), to deal in the Software
5 | without restriction, including without limitation the rights to use, copy, modify,
6 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
7 | permit persons to whom the Software is furnished to do so.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
10 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
11 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
12 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
13 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
14 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/src/cloud/pipelines/_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 | """Provides utilities for SageMaker Pipeline CLI."""
4 | from __future__ import absolute_import
5 |
6 | import ast
7 |
8 |
9 | def get_pipeline_driver(module_name, passed_args=None):
10 | """Gets the driver for generating your pipeline definition.
11 |
12 | Pipeline modules must define a get_pipeline() module-level method.
13 |
14 | Args:
15 | module_name: The module name of your pipeline.
16 | passed_args: Optional passed arguments that your pipeline may be templated by.
17 |
18 | Returns:
19 | The SageMaker Workflow pipeline.
20 | """
21 | _imports = __import__(module_name, fromlist=["get_pipeline"])
22 | kwargs = convert_struct(passed_args)
23 | return _imports.get_pipeline(**kwargs)
24 |
25 |
26 | def convert_struct(str_struct=None):
27 | return ast.literal_eval(str_struct) if str_struct else {}
28 |
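29 | # Example (illustrative): the CLI scripts in this package call get_pipeline_driver()
30 | # with a module path and a dict-style string; the bucket name below is a placeholder.
31 | #
32 | #   pipeline = get_pipeline_driver(
33 | #       "pipelines.semantic_segmentation.pipeline",
34 | #       '{"region": "eu-west-1", "default_bucket": "my-bucket"}',
35 | #   )
36 | #
37 | # convert_struct() parses the string with ast.literal_eval into a kwargs dict,
38 | # and the named module must expose a module-level get_pipeline(**kwargs).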
--------------------------------------------------------------------------------
/src/edge/start_edge_agent.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | # SPDX-License-Identifier: MIT-0
4 | if [ "$SM_EDGE_AGENT_HOME" == "" ]; then
5 | echo "You need to define the env var: SM_EDGE_AGENT_HOME"
6 | exit 1
7 | fi
8 |
9 | echo "SM_EDGE_AGENT_HOME: $SM_EDGE_AGENT_HOME"
10 | AGENT_PID_FILE='/tmp/edge_agent.pid'
11 | APP_PID_FILE='/tmp/edge_app.pid'
12 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
13 |
14 | if ! test -f "$AGENT_PID_FILE" || ! kill -0 $(cat $AGENT_PID_FILE) 2> /dev/null; then
15 | echo "Starting the agent"
16 | rm -f nohup.out /tmp/edge_agent
17 | nohup $SM_EDGE_AGENT_HOME/bin/sagemaker_edge_agent_binary -a /tmp/edge_agent -c $SM_EDGE_AGENT_HOME/conf/config_edge_device.json >> $SM_EDGE_AGENT_HOME/logs/agent.log 2>&1 &
18 | AGENT_PID=$!
19 | echo $AGENT_PID > $AGENT_PID_FILE
20 | fi
21 | echo "AGENT PID: $(cat $AGENT_PID_FILE)"
22 |
23 | echo "Note: Please verify that the edge agent is running by using the command \"ps aux | grep [s]agemaker_edge_agent_binary\". In case you do not see any process running, please check the log file \"$SM_EDGE_AGENT_HOME/logs/agent.log\"".
--------------------------------------------------------------------------------
/src/edge/templates/main_noimg.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %} {% block header %}
2 | {% endblock %} {% block content %}
3 | <div>
4 |   <table>
5 |     <thead>
6 |       <tr>
7 |         <th>Model Name</th>
8 |         <th>Model Version</th>
9 |         <th>Model Identifier</th>
10 |       </tr>
11 |     </thead>
12 |     <tbody>
13 |       {% for m in loaded_models %}
14 |       <tr>
15 |         <td>{{ m.name }}</td>
16 |         <td>{{ m.version }}</td>
17 |         <td>{{ m.identifier }}</td>
18 |       </tr>
19 |       {% endfor %}
20 |     </tbody>
21 |   </table>
22 |
23 |   <p>
24 |     Error: No images to run inference on. Please download some test images into the ./static/ folder.
25 |   </p>
26 | </div>
27 | {% endblock %}
--------------------------------------------------------------------------------
/src/edge/templates/base.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html>
3 |   <head>
4 |     <title>SageMaker Edge Application</title>
5 |   </head>
6 |   <body>
7 |     <header>
8 |       {% block header %}{% endblock %}
9 |     </header>
10 |     {% for message in get_flashed_messages() %}
11 |     <p>{{ message }}</p>
12 |     {% endfor %}
13 |     {% block content %}{% endblock %}
14 |   </body>
15 | </html>
--------------------------------------------------------------------------------
/src/cloud/pipelines/get_pipeline_definition.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 | """A CLI to get pipeline definitions from pipeline modules."""
4 | from __future__ import absolute_import
5 |
6 | import argparse
7 | import sys
8 |
9 | from pipelines._utils import get_pipeline_driver
10 |
11 |
12 | def main(): # pragma: no cover
13 | """The main harness that gets the pipeline definition JSON.
14 |
15 | Prints the json to stdout or saves to file.
16 | """
17 | parser = argparse.ArgumentParser("Gets the pipeline definition for the pipeline script.")
18 |
19 | parser.add_argument(
20 | "-n",
21 | "--module-name",
22 | dest="module_name",
23 | type=str,
24 | help="The module name of the pipeline to import.",
25 | )
26 | parser.add_argument(
27 | "-f",
28 | "--file-name",
29 | dest="file_name",
30 | type=str,
31 | default=None,
32 | help="The file to output the pipeline definition json to.",
33 | )
34 | parser.add_argument(
35 | "-kwargs",
36 | "--kwargs",
37 | dest="kwargs",
38 | default=None,
39 | help="Dict string of keyword arguments for the pipeline generation (if supported)",
40 | )
41 | args = parser.parse_args()
42 |
43 | if args.module_name is None:
44 | parser.print_help()
45 | sys.exit(2)
46 |
47 | try:
48 | pipeline = get_pipeline_driver(args.module_name, args.kwargs)
49 | content = pipeline.definition()
50 | if args.file_name:
51 | with open(args.file_name, "w") as f:
52 | f.write(content)
53 | else:
54 | print(content)
55 | except Exception as e: # pylint: disable=W0703
56 | print(f"Exception: {e}")
57 | sys.exit(1)
58 |
59 |
60 | if __name__ == "__main__":
61 | main()
62 |
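63 | # Example invocation (illustrative; the kwargs values are placeholders):
64 | #   python get_pipeline_definition.py \
65 | #       -n pipelines.semantic_segmentation.pipeline \
66 | #       -kwargs '{"region": "eu-west-1"}' \
67 | #       -f pipeline_definition.json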
--------------------------------------------------------------------------------
/src/edge/app/util.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 | import numpy as np
4 | import boto3
5 | import requests
6 | import PIL
7 | import io
8 | import base64
9 |
10 | def create_dataset(X, time_steps=1, step=1):
11 | '''
12 | Format a timeseries buffer into a multidimensional tensor
13 | required by the model
14 | '''
15 | Xs = []
16 | for i in range(0, len(X) - time_steps, step):
17 | v = X[i:(i + time_steps)]
18 | Xs.append(v)
19 | return np.array(Xs)
20 |
21 | def get_aws_credentials(cred_endpoint, thing_name, cert_file, key_file, ca_file):
22 | '''
23 | Invoke SageMaker Edge Manager endpoint to exchange the certificates
24 | by temp credentials
25 | '''
26 | resp = requests.get(
27 | cred_endpoint,
28 | cert=(cert_file, key_file, ca_file),
29 | )
30 | if not resp:
31 | raise Exception('Error while getting the IoT credentials: ', resp)
32 | credentials = resp.json()
33 | return (credentials['credentials']['accessKeyId'],
34 | credentials['credentials']['secretAccessKey'],
35 | credentials['credentials']['sessionToken'])
36 |
37 | def get_client(service_name, iot_params):
38 | '''
39 | Build a boto3 client of a given service
40 | It uses the temp credentials exchanged by the certificates
41 | '''
42 | access_key_id,secret_access_key,session_token = get_aws_credentials(
43 | iot_params['sagemaker_edge_provider_aws_iot_cred_endpoint'],
44 | iot_params['sagemaker_edge_core_device_name'],
45 | iot_params['sagemaker_edge_provider_aws_cert_file'],
46 | iot_params['sagemaker_edge_provider_aws_cert_pk_file'],
47 | iot_params['sagemaker_edge_provider_aws_ca_cert_file']
48 | )
49 | return boto3.client(
50 | service_name, iot_params['sagemaker_edge_core_region'],
51 | aws_access_key_id=access_key_id,
52 | aws_secret_access_key=secret_access_key,
53 | aws_session_token=session_token
54 | )
55 |
56 | def create_b64_img_from_mask(mask):
57 | """Creates binary stream from (1, SIZE, SIZE)-shaped binary mask"""
58 | img_size = mask.shape[1]
59 | mask_reshaped = np.reshape(mask, (img_size, img_size))
60 | img = PIL.Image.fromarray(np.uint8(mask_reshaped)*255)
61 | img_binary = io.BytesIO()
62 | img.save(img_binary, 'PNG')
63 | img_b64 = base64.b64encode(img_binary.getvalue())
64 | return img_b64
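65 |
66 | # Example (illustrative) of create_b64_img_from_mask: a binary segmentation mask of
67 | # shape (1, 224, 224) becomes a base64-encoded PNG that can be embedded in a web page.
68 | #   mask = (np.random.rand(1, 224, 224) > 0.5).astype(np.uint8)
69 | #   img_b64 = create_b64_img_from_mask(mask)
70 | #   html_src = 'data:image/png;base64,' + img_b64.decode('utf-8')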
--------------------------------------------------------------------------------
/src/edge/app/logger.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 | import threading
4 | import json
5 | import logging
6 | import app.util as util
7 |
8 | IOT_BASE_TOPIC = 'edge-manager-app'
9 |
10 | class Logger(object):
11 | def __init__(self, device_name, iot_params):
12 | '''
13 | This class is responsible for sending application logs
14 | to the cloud via MQTT and IoT Topics
15 | '''
16 | self.device_name = device_name
17 | logging.info("Device Name: %s" % self.device_name)
18 | self.iot_params = iot_params
19 |
20 | self.__update_credentials()
21 |
22 | self.logs_buffer = []
23 | self.__log_lock = threading.Lock()
24 |
25 | def __update_credentials(self):
26 | '''
27 | Get new temp credentials
28 | '''
29 | logging.info("Getting the IoT Credentials")
30 | self.iot_data_client = util.get_client('iot-data', self.iot_params)
31 |
32 | def __run_logs_upload_job__(self):
33 | '''
34 | Launch a thread that will read the logs buffer
35 | prepare a json document and send the logs
36 | '''
37 | self.cloud_log_sync_job = threading.Thread(target=self.__upload_logs__)
38 | self.cloud_log_sync_job.start()
39 |
40 | def __upload_logs__(self):
41 | '''
42 | Invoked by the thread to publish the latest logs
43 | '''
44 | self.__log_lock.acquire(True)
45 | f = json.dumps({'logs': self.logs_buffer})
46 | self.logs_buffer = [] # clean the buffer
47 | try:
48 | self.iot_data_client.publish( topic='%s/logs/%s' % (IOT_BASE_TOPIC, self.device_name), payload=f.encode('utf-8') )
49 | except Exception as e:
50 | logging.error(e)
51 | self.__update_credentials()
52 | self.iot_data_client.publish( topic='%s/logs/%s' % (IOT_BASE_TOPIC, self.device_name), payload=f.encode('utf-8') )
53 |
54 | logging.info("New log file uploaded. len: %d" % len(f))
55 | self.__log_lock.release()
56 |
57 | def publish_logs(self, data):
58 | '''
59 | Invoked by the application, it buffers the logs
60 | '''
61 | buffer_len = 0
62 | if self.__log_lock.acquire(False):
63 | self.logs_buffer.append(data)
64 | buffer_len = len(self.logs_buffer)
65 | self.__log_lock.release()
66 | # else: job is running, discard the new data
67 | if buffer_len > 10:
68 | # run the sync job
69 | self.__run_logs_upload_job__()
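70 |
71 | # Usage sketch (illustrative; the device name and log payload are placeholders):
72 | #   logger = Logger('edge-device-01', iot_params)
73 | #   logger.publish_logs({'model': 'img-classification', 'latency_ms': 42})
74 | # Log records are buffered and uploaded by a background thread once more than ten
75 | # entries have accumulated, published to the 'edge-manager-app/logs/<device_name>' topic.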
--------------------------------------------------------------------------------
/src/cloud/pipelines/run_pipeline.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 | """A CLI to create or update and run pipelines."""
4 | from __future__ import absolute_import
5 |
6 | import argparse
7 | import json
8 | import sys
9 |
10 | from pipelines._utils import get_pipeline_driver, convert_struct
11 |
12 |
13 | def main(): # pragma: no cover
14 | """The main harness that creates or updates and runs the pipeline.
15 |
16 | Creates or updates the pipeline and runs it.
17 | """
18 | parser = argparse.ArgumentParser(
19 | "Creates or updates and runs the pipeline for the pipeline script."
20 | )
21 |
22 | parser.add_argument(
23 | "-n",
24 | "--module-name",
25 | dest="module_name",
26 | type=str,
27 | help="The module name of the pipeline to import.",
28 | )
29 | parser.add_argument(
30 | "-kwargs",
31 | "--kwargs",
32 | dest="kwargs",
33 | default=None,
34 | help="Dict string of keyword arguments for the pipeline generation (if supported)",
35 | )
36 | parser.add_argument(
37 | "-role-arn",
38 | "--role-arn",
39 | dest="role_arn",
40 | type=str,
41 | help="The role arn for the pipeline service execution role.",
42 | )
43 | parser.add_argument(
44 | "-description",
45 | "--description",
46 | dest="description",
47 | type=str,
48 | default=None,
49 | help="The description of the pipeline.",
50 | )
51 | parser.add_argument(
52 | "-tags",
53 | "--tags",
54 | dest="tags",
55 | default=None,
56 | help="""List of dict strings of '[{"Key": "string", "Value": "string"}, ..]'""",
57 | )
58 | args = parser.parse_args()
59 |
60 | if args.module_name is None or args.role_arn is None:
61 | parser.print_help()
62 | sys.exit(2)
63 | tags = convert_struct(args.tags)
64 |
65 | try:
66 | pipeline = get_pipeline_driver(args.module_name, args.kwargs)
67 | print("###### Creating/updating a SageMaker Pipeline with the following definition:")
68 | parsed = json.loads(pipeline.definition())
69 | print(json.dumps(parsed, indent=2, sort_keys=True))
70 |
71 | upsert_response = pipeline.upsert(
72 | role_arn=args.role_arn, description=args.description, tags=tags
73 | )
74 | print("\n###### Created/Updated SageMaker Pipeline: Response received:")
75 | print(upsert_response)
76 |
77 | execution = pipeline.start()
78 | print(f"\n###### Execution started with PipelineExecutionArn: {execution.arn}")
79 |
80 | print("Waiting for the execution to finish...")
81 | execution.wait()
82 | print("\n##### Execution completed. Execution step details:")
83 |
84 | print(execution.list_steps())
85 | # Todo print the status?
86 | except Exception as e: # pylint: disable=W0703
87 | print(f"Exception: {e}")
88 | sys.exit(1)
89 |
90 |
91 | if __name__ == "__main__":
92 | main()
93 |
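94 | # Example invocation (illustrative; the role ARN, bucket and tag values are placeholders):
95 | #   python run_pipeline.py \
96 | #       -n pipelines.semantic_segmentation.pipeline \
97 | #       -role-arn arn:aws:iam::123456789012:role/MySageMakerExecutionRole \
98 | #       -kwargs '{"region": "eu-west-1", "default_bucket": "my-bucket"}' \
99 | #       -tags '[{"Key": "project", "Value": "defect-detection"}]'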
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing Guidelines
2 |
3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
4 | documentation, we greatly value feedback and contributions from our community.
5 |
6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
7 | information to effectively respond to your bug report or contribution.
8 |
9 |
10 | ## Reporting Bugs/Feature Requests
11 |
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 |
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 |
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 |
22 |
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 |
26 | 1. You are working against the latest source on the *main* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 |
30 | To send us a pull request, please:
31 |
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 |
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 |
42 |
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 |
46 |
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 |
52 |
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
55 |
56 |
57 | ## Licensing
58 |
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 |
--------------------------------------------------------------------------------
/src/edge/templates/main.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %} {% block header %}
2 | {% endblock %} {% block content %}
3 | <div>
4 |   <table>
5 |     <thead>
6 |       <tr>
7 |         <th>Model Name</th>
8 |         <th>Model Version</th>
9 |         <th>Model Identifier</th>
10 |       </tr>
11 |     </thead>
12 |     <tbody>
13 |       {% for m in loaded_models %}
14 |       <tr>
15 |         <td>{{ m.name }}</td>
16 |         <td>{{ m.version }}</td>
17 |         <td>{{ m.identifier }}</td>
18 |       </tr>
19 |       {% endfor %}
20 |     </tbody>
21 |   </table>
22 |
23 |   <p>Filename: {{ image_file }}</p>
24 |   <img src="{{ url_for('static', filename=image_file) }}" alt="{{ image_file }}">
25 |
26 |   <div>
27 |     <h3>Image Classification</h3>
28 |     {% if (y_clf_class == 'normal') %}
29 |     <p>Latency: {{ latency_clf }} ms</p>
30 |     <p>Normal</p>
31 |     <p>
32 |       Confidence: normal={{ y_clf_normal }} / anomalous={{ y_clf_anomalous }}
33 |     </p>
34 |     {% elif (y_clf_class == 'anomalous') %}
35 |     <p>Latency: {{ latency_clf }} ms</p>
36 |     <p>Anomalous</p>
37 |     <p>
38 |       Confidence: normal={{ y_clf_normal }} / anomalous={{ y_clf_anomalous }}
39 |     </p>
40 |     {% else %}
41 |     <p>No image classification result available</p>
42 |     {% endif %}
43 |   </div>
44 |
45 |   <div>
46 |     <h3>Semantic Segmentation</h3>
47 |     {% if y_segm_img %}
48 |     <p>Latency: {{ latency_segm }} ms</p>
49 |     <img src="data:image/jpeg;base64,{{ y_segm_img }}" alt="Segmentation mask">
50 |     {% else %}
51 |     <p>No segmentation mask available</p>
52 |     {% endif %}
53 |   </div>
54 | </div>
55 | {% endblock %}
--------------------------------------------------------------------------------
/src/edge/install.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 | import boto3
4 | import os
5 | import tarfile
6 | import stat
7 | import io
8 | import logging
9 | import argparse
10 | import pathlib
11 |
12 | logger = logging.getLogger(__name__)
13 | logging.basicConfig(level=logging.INFO)
14 |
15 | s3_client = boto3.client('s3')
16 |
17 | # S3 key of the device configuration package and the public SageMaker Edge release bucket. Please note that your device needs access to these buckets through IAM
18 | agent_config_package_prefix = 'edge-device-configuration/agent/config.tgz'
19 | agent_version = '1.20210820.e20fa3a'
20 | agent_pkg_bucket = 'sagemaker-edge-release-store-us-west-2-linux-x64'
21 |
22 | def replace_pathnames_in_config(configfile):
23 | """Replaces the pathnames in the agent config to use absolute paths"""
24 | # Read in the file
25 | with open(configfile, 'r') as file :
26 | filedata = file.read()
27 |
28 | # Replace the target string
29 | basepath = str(pathlib.Path().resolve())
30 | filedata = filedata.replace('$WORKDIR', basepath)
31 |
32 | # Write the file out again
33 | with open(configfile, 'w') as file:
34 | file.write(filedata)
35 |
36 | def download_config(bucket_name):
37 | # Check if agent is installed and configured already
38 | if not os.path.isdir('agent'):
39 | logger.info('No SM Edge Agent directory found. Proceeding with download of configuration package...')
40 |
41 | # Get the configuration package with certificates and config files
42 | with io.BytesIO() as file:
43 | s3_client.download_fileobj(bucket_name, agent_config_package_prefix, file)
44 | file.seek(0)
45 | # Extract the files
46 | tar = tarfile.open(fileobj=file)
47 | tar.extractall('.')
48 | tar.close()
49 |
50 | # Replace the variables in the config file to make paths absolute
51 | logger.info('Replacing path names in Edge Agent configuration file...')
52 | replace_pathnames_in_config('./agent/conf/config_edge_device.json')
53 |
54 | # Download and install SageMaker Edge Manager
55 | agent_pkg_key = 'Releases/%s/%s.tgz' % (agent_version, agent_version)
56 | # get the agent package
57 | logger.info('Downloading and installing SageMaker Edge Agent binaries version \"%s\"...' % agent_version)
58 |
59 | with io.BytesIO() as file:
60 | s3_client.download_fileobj(agent_pkg_bucket, agent_pkg_key, file)
61 | file.seek(0)
62 | # Extract the files
63 | tar = tarfile.open(fileobj=file)
64 | tar.extractall('agent')
65 | tar.close()
66 | # Adjust the permissions
67 | os.chmod('agent/bin/sagemaker_edge_agent_binary', stat.S_IXUSR|stat.S_IWUSR|stat.S_IXGRP|stat.S_IWGRP)
68 |
69 | # Finally, create SM Edge Agent client stubs, using protobuffer compiler
70 | logger.info('Creating protobuf agent stubs...')
71 | os.system('mkdir -p app/')
72 | os.system('python3 -m grpc_tools.protoc --proto_path=agent/docs/api --python_out=app/ --grpc_python_out=app/ agent/docs/api/agent.proto')
73 |
74 | if __name__ == '__main__':
75 | parser = argparse.ArgumentParser()
76 | parser.add_argument('--project-name', type=str, required=True)
77 | parser.add_argument('--account-id', type=str, required=True)
78 | args, _ = parser.parse_known_args()
79 |
80 | logger.info('Preparing device...')
81 |
82 | # Infer bucket name from project name and AWS Account ID as created in the CloudFormation template
83 | bucket_name = 'sm-edge-workshop-%s-%s' % (args.project_name, args.account_id)
84 |
85 | # Run the installation script
86 | download_config(bucket_name)
87 |
88 | logger.info('Done!')
89 |
90 |
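91 | # Example invocation (illustrative; project name and account id are placeholders):
92 | #   python3 install.py --project-name my-project --account-id 123456789012
93 | # This downloads the device configuration package from the
94 | # sm-edge-workshop-<project-name>-<account-id> bucket and installs the Edge Agent binaries.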
--------------------------------------------------------------------------------
/src/cloud/pipelines/image_classification/evaluation.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 | import sys
4 | import os
5 | import subprocess
6 |
7 | # Install packages prior to executing the rest of the script. You can also build your own custom container
8 | # with your individual dependencies if needed
9 | subprocess.check_call([sys.executable, "-m", "pip", "install", "mxnet", "opencv-python"])
10 | os.system("apt-get update")
11 | os.system("apt-get install ffmpeg libsm6 libxext6 -y")
12 |
13 | import argparse
14 | import json
15 | import warnings
16 | import logging
17 | import pandas as pd
18 | import numpy as np
19 | from glob import glob
20 | from datetime import datetime
21 | import tarfile
22 | from PIL import Image
23 | from glob import glob
24 | import re
25 |
26 | import mxnet as mx
27 | import mxnet.ndarray as nd
28 | from mxnet import nd, gluon
29 | from mxnet.gluon.data.vision import transforms
30 | from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
31 |
32 | # Constants
33 |
34 | # The image size is provided via the script arguments below
35 |
36 | CLASS_LABELS = ['good', 'bad']
37 |
38 | logger = logging.getLogger()
39 | logger.setLevel(logging.INFO)
40 | logger.addHandler(logging.StreamHandler())
41 |
42 | ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu()
43 |
44 | if __name__=='__main__':
45 | parser = argparse.ArgumentParser()
46 | parser.add_argument('--image-width', type=int, default=224)
47 | parser.add_argument('--image-height', type=int, default=224)
48 | args, _ = parser.parse_known_args()
49 |
50 | logger.info('Received arguments {}'.format(args))
51 |
52 | # Define the paths
53 | test_data_base_path = '/opt/ml/processing/test'
54 | model_data_base_path = '/opt/ml/processing/model'
55 | report_output_base_path = '/opt/ml/processing/report'
56 |
57 | IMAGE_WIDTH = int(args.image_width)
58 | IMAGE_HEIGHT = int(args.image_height)
59 |
60 | # Unzipping the model
61 | model_filename = 'model.tar.gz'
62 | model_path = os.path.join(model_data_base_path, model_filename)
63 | model_path_extracted = './model/'
64 |
65 | with tarfile.open(model_path) as tar:
66 | tar.extractall(path=model_path_extracted)
67 |
68 | # Get the files needed for loading, parse some strings
69 | symbol_file = glob(os.path.join(model_path_extracted, '*symbol.json'))[0]
70 | params_file = glob(os.path.join(model_path_extracted, '*.params'))[0]
71 |
72 | logger.info('Symbol file: %s' % symbol_file)
73 | logger.info('Params file: %s' % params_file)
74 |
75 | symbol_filename = os.path.basename(symbol_file)
76 | params_filename = os.path.basename(params_file)
77 |
78 | # Extract name and epoch needed for loading
79 | model_name = re.search(r".+(?=-symbol\.json)", symbol_filename).group(0)
80 | epoch = int(re.search(r"[0-9]+(?=\.params)", params_filename).group(0))
81 |
82 | # Loading model
83 | logger.info('Loading model from artifacts...')
84 | sym, arg_params, aux_params = mx.model.load_checkpoint(os.path.join(model_path_extracted, model_name), epoch)
85 | model = mx.mod.Module(symbol=sym, context=mx.cpu(), label_names=['data'])
86 | model.bind(for_training=False, data_shapes=[('data', (1,3,IMAGE_WIDTH,IMAGE_HEIGHT))],
87 | label_shapes=model._label_shapes)
88 | model.set_params(arg_params, aux_params, allow_missing=True)
89 |
90 | # Load test data into record iterator (batch size 1)
91 | test_data = mx.io.ImageRecordIter(
92 | path_imgrec = os.path.join(test_data_base_path, 'test.rec'),
93 | data_shape = (3, IMAGE_WIDTH, IMAGE_HEIGHT),
94 | batch_size = 1,
95 | shuffle = True
96 | )
97 |
98 | # Lists for the predicted and true labels
99 | y_true = []
100 | y_pred = []
101 |
102 | # For each batch (size=1) predict the class
103 | # TODO: make batch prediction work
104 | for batch in test_data:
105 | res = model.predict(eval_data=batch.data[0])
106 | pred_class = int(np.argmax(res[0]).asnumpy()[0])
107 | y_pred.append(pred_class)
108 | y_true.append(int(batch.label[0].asnumpy()))
109 |
110 | clf_report = classification_report(y_true, y_pred, target_names=CLASS_LABELS, output_dict=True)
111 | accuracy = accuracy_score(y_true, y_pred)
112 |
113 | # Save the preprocessing report to make information available to downstream steps
114 | evaluation_report = {
115 | 'multiclass_classification_metrics': {
116 | 'accuracy': {
117 | 'value': accuracy,
118 | 'standard_deviation': 'NaN'
119 | },
120 | 'weighted_recall': {
121 | 'value': clf_report['weighted avg']['recall'],
122 | 'standard_deviation': 'NaN'
123 | },
124 | 'weighted_precision': {
125 | 'value': clf_report['weighted avg']['precision'],
126 | 'standard_deviation': 'NaN'
127 | },
128 | 'weighted_f1': {
129 | 'value': clf_report['weighted avg']['f1-score'],
130 | 'standard_deviation': 'NaN'
131 | }
132 | },
133 | 'classification_report': clf_report
134 | }
135 | print('Evaluation report:', evaluation_report)
136 | report_output_path = os.path.join(report_output_base_path, 'evaluation_report.json')
137 | with open(report_output_path, "w") as f:
138 | f.write(json.dumps(evaluation_report))
139 |
--------------------------------------------------------------------------------
/src/edge/app/edgeagentclient.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 | # From: https://github.com/aws-samples/amazon-sagemaker-edge-manager-demo/blob/main/04_EdgeApplication/turbine/edgeagentclient.py
4 |
5 | import grpc
6 | import logging
7 | import app.agent_pb2 as agent
8 | import app.agent_pb2_grpc as agent_grpc
9 | import struct
10 | import numpy as np
11 | import uuid
12 |
13 | class EdgeAgentClient(object):
14 | """ Helper class that uses the Edge Agent stubs to
15 | communicate with the SageMaker Edge Agent over a Unix socket.
16 |
17 | To generate the stubs you need to use protoc. First install/update:
18 | pip3 install -U grpcio-tools grpcio protobuf
19 | then generate the code using the provided agent.proto file
20 |
21 | python3 -m grpc_tools.protoc \
22 | --proto_path=$PWD/agent/docs/api --python_out=./app --grpc_python_out=./app $PWD/agent/docs/api/agent.proto
23 |
24 | """
25 | def __init__(self, channel_path):
26 | # connect to the agent and list the models
27 | self.channel = grpc.insecure_channel('unix://%s' % channel_path )
28 | self.agent = agent_grpc.AgentStub(self.channel)
29 | self.model_map = {}
30 |
31 | def __update_models_list__(self):
32 | models_list = self.agent.ListModels(agent.ListModelsRequest())
33 | self.model_map = {m.name:{'in': m.input_tensor_metadatas, 'out': m.output_tensor_metadatas} for m in models_list.models}
34 | return self.model_map
35 |
36 | def capture_data(self, model_name, input_data, output_data):
37 | """The CaptureData request to the edge agent"""
38 | try:
39 | logging.info('Capturing data for model %s' % model_name)
40 | req = agent.CaptureDataRequest()
41 | req.model_name = model_name
42 | req.capture_id = str(uuid.uuid4())
43 | req.input_tensors.append( self.create_tensor(input_data, 'input'))
44 | req.output_tensors.append( self.create_tensor(output_data, 'output'))
45 | resp = self.agent.CaptureData(req)
46 | except Exception as e:
47 | logging.error('Error in capture_data: %s' % e)
48 |
49 | def create_tensor(self, x, tensor_name):
50 | """Creates a Edge agent tensor from a numpy float32 array"""
51 | if (x.dtype != np.float32):
52 | raise Exception( "Only numpy float32 arrays are supported, but tensor %s has dtype %s" % (tensor_name, x.dtype))
53 | tensor = agent.Tensor()
54 | tensor.tensor_metadata.name = tensor_name.encode()
55 | tensor.tensor_metadata.data_type = agent.FLOAT32
56 | for s in x.shape: tensor.tensor_metadata.shape.append(s)
57 | tensor.byte_data = x.tobytes()
58 | return tensor
59 |
60 | def predict(self, model_name, x, shm=False):
61 | """
62 | Invokes the model and get the predictions
63 | """
64 | try:
65 | if self.model_map.get(model_name) is None:
66 | raise Exception('Model %s not loaded' % model_name)
67 | # Create a request
68 | req = agent.PredictRequest()
69 | req.name = model_name
70 | # Then load the data into a temp Tensor
71 | tensor = agent.Tensor()
72 | meta = self.model_map[model_name]['in'][0]
73 | tensor.tensor_metadata.name = meta.name
74 | tensor.tensor_metadata.data_type = meta.data_type
75 | for s in meta.shape: tensor.tensor_metadata.shape.append(s)
76 |
77 | if shm:
78 | tensor.shared_memory_handle.offset = 0
79 | tensor.shared_memory_handle.segment_id = x
80 | else:
81 | tensor.byte_data = x.astype(np.float32).tobytes()
82 |
83 | req.tensors.append(tensor)
84 |
85 | # Invoke the model
86 | resp = self.agent.Predict(req)
87 |
88 | # Parse the output
89 | meta = self.model_map[model_name]['out'][0]
90 | tensor = resp.tensors[0]
91 | data = np.frombuffer(tensor.byte_data, dtype=np.float32)
92 | return data.reshape(tensor.tensor_metadata.shape)
93 | except Exception as e:
94 | logging.error('Error in predict: %s' % e)
95 | return None
96 |
97 | def is_model_loaded(self, model_name):
98 | return self.model_map.get(model_name) is not None
99 |
100 | def load_model(self, model_name, model_path):
101 | """ Load a new model into the Edge Agent if not loaded yet"""
102 | try:
103 | if self.is_model_loaded(model_name):
104 | logging.info( "Model %s was already loaded" % model_name )
105 | return self.model_map
106 | req = agent.LoadModelRequest()
107 | req.url = model_path
108 | req.name = model_name
109 | resp = self.agent.LoadModel(req)
110 |
111 | return self.__update_models_list__()
112 | except Exception as e:
113 | logging.error('Error in load_model: %s' % e)
114 | return None
115 |
116 | def unload_model(self, model_name):
117 | """ UnLoad model from the Edge Agent"""
118 | try:
119 | if not self.is_model_loaded(model_name):
120 | logging.info( "Model %s was not loaded" % model_name )
121 | return self.model_map
122 |
123 | req = agent.UnLoadModelRequest()
124 | req.name = model_name
125 | resp = self.agent.UnLoadModel(req)
126 |
127 | return self.__update_models_list__()
128 | except Exception as e:
129 | logging.error('Error in unload_model: %s' % e)
130 | return None
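131 |
132 | # Usage sketch (illustrative; model name, model path and input shape are placeholders,
133 | # the socket path matches the one used in start_edge_agent.sh):
134 | #   client = EdgeAgentClient('/tmp/edge_agent')
135 | #   client.load_model('img-classification', '/path/to/compiled/model')
136 | #   y = client.predict('img-classification', np.zeros((1, 3, 224, 224), dtype=np.float32))
137 | #   client.unload_model('img-classification')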
--------------------------------------------------------------------------------
/src/cloud/data_preparation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "source": [
6 | "# Defect Detection at the edge using Amazon SageMaker - Data preparation and preprocessing\n",
7 | "In this notebook, we will download the dataset and preprocess it accordingly to be used with the provided training pipelines."
8 | ],
9 | "metadata": {}
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "source": [
15 | "import boto3\n",
16 | "import time\n",
17 | "import uuid\n",
18 | "import json\n",
19 | "import numpy as np\n",
20 | "import pandas as pd\n",
21 | "from PIL import Image\n",
22 | "import glob, os\n",
23 | "from shutil import copyfile\n",
24 | "import sagemaker\n",
25 | "\n",
26 | "sts_client = boto3.client('sts')\n",
27 | "\n",
28 | "# Get the account id\n",
29 | "account_id = sts_client.get_caller_identity()[\"Account\"]\n",
30 | "\n",
31 | "# Project Name as defined in your CloudFormation template\n",
32 | "PROJECT_NAME = ''\n",
33 | "\n",
34 | "region = boto3.Session().region_name\n",
35 | "role = sagemaker.get_execution_role()\n",
36 | "bucket_name = 'sm-edge-workshop-%s-%s' % (PROJECT_NAME, account_id)"
37 | ],
38 | "outputs": [],
39 | "metadata": {}
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "source": [
45 | "# Download the dataset\n",
46 | "!mkdir ./data\n",
47 | "!wget -P ./data http://go.vicos.si/kolektorsdd2"
48 | ],
49 | "outputs": [],
50 | "metadata": {}
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "source": [
56 | "# Extract it\n",
57 | "!unzip ./data/kolektorsdd2 -d ./data/kolektor "
58 | ],
59 | "outputs": [],
60 | "metadata": {}
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "source": [
66 | "# Define some utilities\n",
67 | "\n",
68 | "def img_read(path):\n",
69 | " \"\"\"Read image as numpy array\"\"\"\n",
70 | " with Image.open(path) as i:\n",
71 | " img = np.asarray(i)\n",
72 | " return img\n",
73 | "\n",
74 | "def img_is_anomalous(img):\n",
75 | " \"\"\"Assess whether an image is anomalous by assuming non-black masks are anomalous\"\"\"\n",
76 | " if np.mean(img) > 0:\n",
77 | " return True\n",
78 | " else:\n",
79 | " return False\n",
80 | " \n",
81 | "def sort_img_by_mask(mask_file, dir_normal, dir_anomalous):\n",
82 | " \"\"\"Copy file into specified directories based on mask\"\"\"\n",
83 | " mask_img = img_read(mask_file)\n",
84 | " data_img = mask_file.replace('_GT', '')\n",
85 | " if img_is_anomalous(mask_img):\n",
86 | " copyfile(data_img, os.path.join(dir_anomalous, os.path.basename(data_img)))\n",
87 | " else:\n",
88 | " copyfile(data_img, os.path.join(dir_normal, os.path.basename(data_img)))\n",
89 | " return"
90 | ],
91 | "outputs": [],
92 | "metadata": {}
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "source": [
98 | "# Define the base directory where the files are located and get a list of all the maks files\n",
99 | "directory = './data/kolektor/train/'\n",
100 | "mask_files = [f for f in glob.glob(os.path.join(directory, '*_GT.png'))]"
101 | ],
102 | "outputs": [],
103 | "metadata": {}
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "source": [
109 | "# Create folders for the preprocessed images\n",
110 | "!mkdir ./data/kolektor-preprocessed\n",
111 | "!mkdir ./data/kolektor-preprocessed/img-classification\n",
112 | "!mkdir ./data/kolektor-preprocessed/img-classification/normal\n",
113 | "!mkdir ./data/kolektor-preprocessed/img-classification/anomalous\n",
114 | "\n",
115 | "!mkdir ./data/kolektor-preprocessed/semantic-segmentation\n",
116 | "!mkdir ./data/kolektor-preprocessed/semantic-segmentation/images\n",
117 | "!mkdir ./data/kolektor-preprocessed/semantic-segmentation/masks"
118 | ],
119 | "outputs": [],
120 | "metadata": {}
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "source": [
126 | "# Read the files and sort them by mask file. If the mask file is just black, we assume that there is no anomaly and thus categorize it as \"normal\"\n",
127 | "\n",
128 | "dir_normal = './data/kolektor-preprocessed/img-classification/normal'\n",
129 | "dir_anomalous = './data/kolektor-preprocessed/img-classification/anomalous'\n",
130 | "\n",
131 | "for mask_file in mask_files:\n",
132 | " sort_img_by_mask(mask_file, dir_normal, dir_anomalous)"
133 | ],
134 | "outputs": [],
135 | "metadata": {}
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "source": [
141 | "# Sort the files into different folders for their masks and base images\n",
142 | "\n",
143 | "all_files = [f for f in glob.glob(os.path.join(directory, '*.png'))]\n",
144 | "dir_images = './data/kolektor-preprocessed/semantic-segmentation/images'\n",
145 | "dir_masks = './data/kolektor-preprocessed/semantic-segmentation/masks'\n",
146 | "\n",
147 | "for img_path in all_files:\n",
148 | " if '_GT' in img_path:\n",
149 | " # image is mask, sort into mask subdirectory\n",
150 | " copyfile(img_path, os.path.join(dir_masks, os.path.basename(img_path).replace('_GT', '')))\n",
151 | " else:\n",
152 | " copyfile(img_path, os.path.join(dir_images, os.path.basename(img_path)))"
153 | ],
154 | "outputs": [],
155 | "metadata": {}
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "source": [
161 | "# Copy to S3 bucket\n",
162 | "!aws s3 cp --recursive --quiet ./data/kolektor-preprocessed/ s3://$bucket_name/data/"
163 | ],
164 | "outputs": [],
165 | "metadata": {}
166 | }
167 | ],
168 | "metadata": {
169 | "instance_type": "ml.t3.medium",
170 | "kernelspec": {
171 | "display_name": "Python 3 (Data Science)",
172 | "language": "python",
173 | "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:eu-west-1:470317259841:image/datascience-1.0"
174 | },
175 | "language_info": {
176 | "codemirror_mode": {
177 | "name": "ipython",
178 | "version": 3
179 | },
180 | "file_extension": ".py",
181 | "mimetype": "text/x-python",
182 | "name": "python",
183 | "nbconvert_exporter": "python",
184 | "pygments_lexer": "ipython3",
185 | "version": "3.7.10"
186 | }
187 | },
188 | "nbformat": 4,
189 | "nbformat_minor": 4
190 | }
191 |
--------------------------------------------------------------------------------
/src/cloud/pipelines/semantic_segmentation/train_tf.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 | import argparse
4 | import numpy as np
5 | import os
6 | from glob import glob
7 | import cv2
8 | import tensorflow as tf
9 | from tensorflow import keras
10 | import pandas as pd
11 | from tensorflow.keras.layers import Conv2D, Activation, BatchNormalization
12 | from tensorflow.keras.layers import UpSampling2D, Input, Concatenate
13 | from tensorflow.keras.models import Model
14 | from tensorflow.keras.applications import MobileNetV2
15 | from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
16 | from tensorflow.keras.metrics import Recall, Precision
17 | from tensorflow.keras import backend as K
18 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
19 |
20 | IMAGE_WIDTH=224
21 | IMAGE_HEIGHT=224
22 |
23 | def parse_args():
24 |
25 | parser = argparse.ArgumentParser()
26 |
27 | # hyperparameters sent by the client are passed as command-line arguments to the script
28 | parser.add_argument('--epochs', type=int, default=100)
29 | parser.add_argument('--batch_size', type=int, default=8)
30 | parser.add_argument('--learning_rate', type=float, default=1e-4)
31 |
32 | # data directories
33 | parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
34 | parser.add_argument('--validation', type=str, default=os.environ.get('SM_CHANNEL_VALIDATION'))
35 |
36 | # model directory: we will use the default set by SageMaker, /opt/ml/model
37 | parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
38 |
39 | return parser.parse_known_args()
40 |
41 | def read_image(path):
42 | path = path.decode()
43 | x = cv2.imread(path, cv2.IMREAD_COLOR)
44 | x = cv2.resize(x, (IMAGE_WIDTH, IMAGE_HEIGHT))
45 | x = x/255.0
46 | return x
47 |
48 | def read_mask(path):
49 | path = path.decode()
50 | x = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
51 | x = cv2.resize(x, (IMAGE_WIDTH, IMAGE_HEIGHT))
52 | x = x/255.0
53 | x = np.expand_dims(x, axis=-1)
54 | return x
55 |
56 | def tf_parse(x, y):
57 | def _parse(x, y):
58 | x = read_image(x)
59 | y = read_mask(y)
60 | return x, y
61 |
62 | x, y = tf.numpy_function(_parse, [x, y], [tf.float64, tf.float64])
63 | x.set_shape([IMAGE_WIDTH, IMAGE_HEIGHT, 3])
64 | y.set_shape([IMAGE_WIDTH, IMAGE_HEIGHT, 1])
65 | return x, y
66 |
67 | def tf_dataset(x, y, batch=8):
68 | dataset = tf.data.Dataset.from_tensor_slices((x, y))
69 | dataset = dataset.map(tf_parse)
70 | dataset = dataset.batch(batch)
71 | dataset = dataset.repeat()
72 | return dataset
73 |
74 |
75 | def model():
76 | inputs = Input(shape=(IMAGE_WIDTH, IMAGE_HEIGHT, 3), name="input_image")
77 |
78 | encoder = MobileNetV2(input_tensor=inputs, weights="imagenet", include_top=False, alpha=0.35)
79 | skip_connection_names = ["input_image", "block_1_expand_relu", "block_3_expand_relu", "block_6_expand_relu"]
80 | encoder_output = encoder.get_layer("block_13_expand_relu").output
81 |
82 | f = [16, 32, 48, 64]
83 | x = encoder_output
84 | for i in range(1, len(skip_connection_names)+1, 1):
85 | x_skip = encoder.get_layer(skip_connection_names[-i]).output
86 | x = UpSampling2D((2, 2))(x)
87 | x = Concatenate()([x, x_skip])
88 |
89 | x = Conv2D(f[-i], (3, 3), padding="same")(x)
90 | x = BatchNormalization()(x)
91 | x = Activation("relu")(x)
92 |
93 | x = Conv2D(f[-i], (3, 3), padding="same")(x)
94 | x = BatchNormalization()(x)
95 | x = Activation("relu")(x)
96 |
97 | x = Conv2D(1, (1, 1), padding="same")(x)
98 | x = Activation("sigmoid")(x)
99 |
100 | model = Model(inputs, x)
101 | return model
102 |
103 |
104 | def dice_coef(y_true, y_pred):
105 | smooth = 1e-15
106 | y_true = tf.keras.layers.Flatten()(y_true)
107 | y_pred = tf.keras.layers.Flatten()(y_pred)
108 | intersection = tf.reduce_sum(y_true * y_pred)
109 | return (2. * intersection + smooth) / (tf.reduce_sum(y_true) + tf.reduce_sum(y_pred) + smooth)
110 |
111 | def dice_loss(y_true, y_pred):
112 | return 1.0 - dice_coef(y_true, y_pred)
113 |
114 | def get_train_data(train_files_path,validation_files_path):
115 |
116 | train_x = sorted(glob(os.path.join(train_files_path, "images/*")))
117 | train_y = sorted(glob(os.path.join(train_files_path, "masks/*")))
118 |
119 | valid_x = sorted(glob(os.path.join(validation_files_path, "images/*")))
120 | valid_y = sorted(glob(os.path.join(validation_files_path, "masks/*")))
121 |
122 |
123 |
124 | return train_x,train_y,valid_x,valid_y
125 |
126 |
127 | if __name__ == "__main__":
128 |
129 | args, _ = parse_args()
130 | EPOCHS = args.epochs
131 | BATCH = args.batch_size
132 | LR = args.learning_rate
133 |
134 | train_x,train_y,valid_x,valid_y = get_train_data(args.train,args.validation)
135 | train_dataset = tf_dataset(train_x, train_y, batch=BATCH)
136 | valid_dataset = tf_dataset(valid_x, valid_y, batch=BATCH)
137 | print(train_dataset)
138 |
139 |
140 | device = '/cpu:0'
141 | print(device)
142 | batch_size = args.batch_size
143 | epochs = args.epochs
144 | learning_rate = args.learning_rate
145 | print('batch_size = {}, epochs = {}, learning rate = {}'.format(batch_size, epochs, learning_rate))
146 |
147 | with tf.device(device):
148 |
149 | model = model()
150 | opt = tf.keras.optimizers.Nadam(LR)
151 | metrics = [dice_coef, Recall(), Precision()]
152 | model.compile(loss=dice_loss, optimizer=opt, metrics=metrics)
153 |
154 | callbacks = [
155 | ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=4),
156 | EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=False)
157 | ]
158 |
159 | train_steps = len(train_x)//BATCH
160 | valid_steps = len(valid_x)//BATCH
161 |
162 | if len(train_x) % BATCH != 0:
163 | train_steps += 1
164 | if len(valid_x) % BATCH != 0:
165 | valid_steps += 1
166 | model.fit(
167 | train_dataset,
168 | validation_data=valid_dataset,
169 | epochs=EPOCHS,
170 | steps_per_epoch=train_steps,
171 | validation_steps=valid_steps,
172 | callbacks=callbacks
173 | )
174 | # evaluate on train set
175 | scores = model.evaluate(train_dataset,steps=train_steps)
176 | print("\ntrain metrics (loss, dice_coef, recall, precision):", scores)
177 |
178 | # evaluate on val set
179 | scores = model.evaluate(valid_dataset,steps=valid_steps)
180 | print("\nval metrics (loss, dice_coef, recall, precision):", scores)
181 |
182 | # save model
183 | #model.save(args.model_dir + '/1')
184 |
185 | # Save as .h5; SageMaker Neo supports only the .h5 format for Keras. Set 'include_optimizer=False' to remove operators that do not compile
186 | filepath=args.model_dir + '/unet_mobilenetv2.h5'
187 | tf.keras.models.save_model(
188 | model, filepath, overwrite=True, include_optimizer=False, save_format='h5'#,
189 | #signatures=None, options=None, save_traces=True
190 | )
191 |
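192 | # Example local invocation (illustrative; the directory paths are placeholders and must
193 | # contain the 'images/' and 'masks/' subfolders expected by get_train_data):
194 | #   python train_tf.py --epochs 1 --batch_size 8 --learning_rate 0.0001 \
195 | #       --train ./data/train --validation ./data/val --model_dir ./model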
--------------------------------------------------------------------------------
/src/cloud/pipelines/semantic_segmentation/pipeline.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 | import os
4 | import numpy as np
5 | import boto3
6 | import time
7 | import sagemaker
8 | import sagemaker.session
9 |
10 | from sagemaker.workflow.parameters import ParameterInteger, ParameterString
11 | from sagemaker.sklearn.processing import SKLearnProcessor
12 | from sagemaker.processing import ProcessingInput, ProcessingOutput
13 | from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CacheConfig
14 | from sagemaker.workflow.properties import PropertyFile
15 | from sagemaker.inputs import TrainingInput
16 | from sagemaker.workflow.step_collections import RegisterModel
17 | from sagemaker.workflow.pipeline import Pipeline
18 |
19 | BASE_DIR = os.path.dirname(os.path.realpath(__file__))
20 |
21 | def get_session(region, default_bucket):
22 | """Gets the sagemaker session based on the region.
23 |
24 | Args:
25 | region: the aws region to start the session
26 | default_bucket: the bucket to use for storing the artifacts
27 |
28 | Returns:
29 | A `sagemaker.session.Session` instance
30 | """
31 |
32 | boto_session = boto3.Session(region_name=region)
33 |
34 | sagemaker_client = boto_session.client("sagemaker")
35 | runtime_client = boto_session.client("sagemaker-runtime")
36 | return sagemaker.session.Session(
37 | boto_session=boto_session,
38 | sagemaker_client=sagemaker_client,
39 | sagemaker_runtime_client=runtime_client,
40 | default_bucket=default_bucket,
41 | )
42 |
43 | def get_pipeline(
44 | region,
45 | role=None,
46 | default_bucket=None,
47 | pipeline_name="defect-detection-semantic-segmentation-pipeline",
48 | base_job_prefix="defect-detection-semantic-segmentation",
49 | ):
50 | """Gets a SageMaker ML Pipeline instance working with on DefectDetection data.
51 |
52 | Args:
53 | region: AWS region to create and run the pipeline.
54 | role: IAM role to create and run steps and pipeline.
55 | default_bucket: the bucket to use for storing the artifacts
56 |
57 | Returns:
58 | an instance of a pipeline
59 | """
60 | sagemaker_session = get_session(region, default_bucket)
61 | if role is None:
62 | role = sagemaker.session.get_execution_role(sagemaker_session)
63 |
64 | ## With caching enabled, re-running this pipeline without changing the input
65 | ## parameters will skip the training part and reuse the previously trained model
66 | cache_config = CacheConfig(enable_caching=True, expire_after="30d")
67 | ts = time.strftime('%Y-%m-%d-%H-%M-%S')
68 |
69 | # Data prep
70 | processing_instance_type = ParameterString( # instance type for data preparation
71 | name="ProcessingInstanceType",
72 | default_value="ml.m5.xlarge"
73 | )
74 | processing_instance_count = ParameterInteger( # number of instances used for data preparation
75 | name="ProcessingInstanceCount",
76 | default_value=1
77 | )
78 |
79 | # Training
80 | training_instance_type = ParameterString( # instance type for training the model
81 | name="TrainingInstanceType",
82 | default_value="ml.c5.xlarge"
83 | )
84 | training_instance_count = ParameterInteger( # number of instances used to train your model
85 | name="TrainingInstanceCount",
86 | default_value=1
87 | )
88 | training_epochs = ParameterString(
89 | name="TrainingEpochs",
90 | default_value="100"
91 | )
92 |
93 | # Dataset input data: S3 path
94 | input_data = ParameterString(
95 | name="InputData",
96 | default_value="",
97 | )
98 |
99 | # Model Approval State
100 | model_approval_status = ParameterString(
101 | name="ModelApprovalStatus",
102 | default_value="PendingManualApproval"
103 | )
104 |
105 | # Model package group name for registering in model registry
106 | model_package_group_name = ParameterString(
107 | name="ModelPackageGroupName",
108 | default_value="defect-detection-semantic-segmentation-model-group"
109 | )
110 |
111 | # The preprocessor
112 | preprocessor = SKLearnProcessor(
113 | framework_version="0.23-1",
114 | role=role,
115 | instance_type=processing_instance_type,
116 | instance_count=processing_instance_count,
117 | max_runtime_in_seconds=7200,
118 | )
119 |
120 | # A preprocessing report to store some information from the preprocessing step for next steps
121 | preprocessing_report = PropertyFile(
122 | name='PreprocessingReport',
123 | output_name='preprocessing_report',
124 | path='preprocessing_report.json'
125 | )
126 |
127 | # Preprocessing Step
128 | step_process = ProcessingStep(
129 | name="DefectDetectionPreprocessing",
130 | code=os.path.join(BASE_DIR, 'preprocessing.py'), ## this is the script defined above
131 | processor=preprocessor,
132 | inputs=[
133 | ProcessingInput(source=input_data, destination='/opt/ml/processing/input')
134 | ],
135 | outputs=[
136 | ProcessingOutput(output_name='train_data', source='/opt/ml/processing/train'),
137 | ProcessingOutput(output_name='test_data', source='/opt/ml/processing/test'),
138 | ProcessingOutput(output_name='val_data', source='/opt/ml/processing/val'),
139 | ProcessingOutput(output_name='preprocessing_report', source='/opt/ml/processing/report')
140 | ],
141 | job_arguments=['--split', '0.1'],
142 | property_files=[preprocessing_report]
143 | )
144 |
145 | from sagemaker.tensorflow import TensorFlow
146 | model_dir = '/opt/ml/model'
147 | hyperparameters = {'epochs': training_epochs, 'batch_size': 8, 'learning_rate': 0.0001}
148 | estimator = TensorFlow(source_dir=BASE_DIR,
149 | entry_point='train_tf.py',
150 | model_dir=model_dir,
151 | instance_type=training_instance_type,
152 | #instance_type='local',
153 | instance_count=training_instance_count,
154 | hyperparameters=hyperparameters,
155 | role=role,
156 | output_path='s3://{}/{}/{}/{}'.format(default_bucket, 'models', base_job_prefix, 'training-output'),
157 | framework_version='2.2.0',
158 | py_version='py37',
159 | script_mode=True
160 | )
161 |
162 | step_train = TrainingStep(
163 | name="DefectDetectionSemanticSegmentationTrain",
164 | estimator=estimator,
165 | inputs={
166 | "train": TrainingInput(
167 | s3_data=step_process.properties.ProcessingOutputConfig.Outputs["train_data"].S3Output.S3Uri,
168 | content_type='image/png',
169 | s3_data_type='S3Prefix'
170 | ),
171 | "validation": TrainingInput(
172 | s3_data=step_process.properties.ProcessingOutputConfig.Outputs["val_data"].S3Output.S3Uri,
173 | content_type='image/png',
174 | s3_data_type='S3Prefix'
175 | )
176 | },
177 | cache_config=cache_config
178 | )
179 |
180 | # Register model step that will be conditionally executed
181 | step_register = RegisterModel(
182 | name="DefectDetectionSemanticSegmentationRegister",
183 | estimator=estimator,
184 | model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
185 | content_types=["image/png"],
186 | response_types=["application/json"],
187 | inference_instances=["ml.c5.2xlarge", "ml.p3.2xlarge"],
188 | transform_instances=["ml.c5.xlarge"],
189 | model_package_group_name=model_package_group_name,
190 | approval_status=model_approval_status
191 | )
192 |
193 | pipeline = Pipeline(
194 | name=pipeline_name,
195 | parameters=[
196 | processing_instance_type,
197 | processing_instance_count,
198 | training_instance_type,
199 | training_instance_count,
200 | training_epochs,
201 | input_data,
202 | model_approval_status,
203 | model_package_group_name
204 | ],
205 | steps=[step_process, step_train, step_register],
206 | sagemaker_session=sagemaker_session,
207 | )
208 | return pipeline
209 |
--------------------------------------------------------------------------------
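A minimal sketch of how this `get_pipeline` function can be used to inspect the generated DAG before creating anything in the account; the region, role ARN, and bucket name below are placeholders, and valid AWS credentials are required. The actual upsert/start flow is shown in the notebook later in this repository.

    import json
    from pipelines.semantic_segmentation.pipeline import get_pipeline

    pipeline = get_pipeline(
        region='eu-west-1',                                        # placeholder region
        role='arn:aws:iam::123456789012:role/MySageMakerRole',     # placeholder role ARN
        default_bucket='my-artifact-bucket',                       # placeholder bucket
    )

    # Serialize the pipeline definition and list its step names; this does not create
    # the pipeline resource, although resolving the steps may stage the processing
    # script to the bucket.
    definition = json.loads(pipeline.definition())
    print([step['Name'] for step in definition['Steps']])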
/src/cloud/pipelines/image_classification/preprocessing.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 | import sys
4 | import os
5 | import subprocess
6 |
7 | # Install packages prior to executing the rest of the script. You can also build your own custom container
8 | # with your individual dependencies if needed
9 | subprocess.check_call([sys.executable, "-m", "pip", "install", "Augmentor", "wget", "mxnet", "opencv-python"])
10 | os.system("apt-get update -y")
11 | os.system("apt-get install ffmpeg libsm6 libxext6 -y")
12 |
13 | import argparse
14 | import json
15 | import warnings
16 | import pandas as pd
17 | import numpy as np
18 | from glob import glob
19 | from datetime import datetime
20 | import shutil
21 | import wget
22 | from PIL import Image
23 | import Augmentor
24 |
25 | from sklearn.model_selection import train_test_split
26 |
27 |
28 | # Constants
29 |
30 | # the "folders" in the S3 bucket which define which images are good or bad
31 | PREFIX_NAME_NORMAL = 'normal'
32 | PREFIX_NAME_ANOMALOUS = 'anomalous'
33 |
34 |
35 | # Download im2rec.py tool for RecordIO conversion
36 | filename_im2rec_tool = wget.download("https://raw.githubusercontent.com/apache/incubator-mxnet/master/tools/im2rec.py")
37 |
38 | def augment_data(path, sample_count):
39 | """Augments the image dataset in the given path by adding rotation, zoom,
40 | brightness, contrast to the dataset"""
41 | p = Augmentor.Pipeline(path, output_directory=path)
42 |
43 | # Define augmentation operations
44 | #p.rotate(probability=0.4, max_left_rotation=8, max_right_rotation=8)
45 | #p.zoom(probability=0.3, min_factor=1.1, max_factor=1.3)
46 | p.random_brightness(probability=0.3, min_factor=0.4, max_factor=0.9)
47 | p.random_contrast(probability=0.2, min_factor=0.9, max_factor=1.1)
48 |
49 | p.sample(sample_count)
50 |
51 |
52 | def split_dataset(path, split=0.1):
53 | """Split the images into train-test-validation and move them into separate folder each (named train, test, val)"""
54 |
55 | label_map = { 'good': 0, 'bad': 1 }
56 | bad = sorted(glob(os.path.join(path, "%s/*" % PREFIX_NAME_ANOMALOUS)))
57 | good = sorted(glob(os.path.join(path, "%s/*" % PREFIX_NAME_NORMAL)))
58 |
59 | images = bad + good
60 | labels = ([label_map['bad']] * len(bad)) + ([label_map['good']] * len(good))
61 |
62 | total_size = len(images)
63 | valid_size = int(split * total_size)
64 | test_size = int(split * total_size)
65 | print('Total number of samples (normal and anomalous):', total_size)
66 |
67 | train_x, valid_x = train_test_split(images, test_size=valid_size, random_state=42)
68 | train_y, valid_y = train_test_split(labels, test_size=valid_size, random_state=42)
69 |
70 | train_x, test_x = train_test_split(train_x, test_size=test_size, random_state=42)
71 | train_y, test_y = train_test_split(train_y, test_size=test_size, random_state=42)
72 |
73 | return (train_x, train_y), (valid_x, valid_y), (test_x, test_y)
74 |
75 | def resize_images(path, width, height):
76 | """Resize all images in a given path (in-place). Please note that this method
77 | overwrites existing images in the path"""
78 | files = glob(os.path.join(path, '*.png')) + glob(os.path.join(path, '*.jpg'))
79 | for file in files:
80 | im = Image.open(file)
81 | im_resized = im.resize((width, height), Image.ANTIALIAS)
82 | im_resized.save(file)
83 |
84 |
85 | def get_square_image(img):
86 | """Returns a squared image by adding black padding"""
87 | padding_color = (0, 0, 0)
88 | width, height = img.size
89 | if width == height:
90 | return img
91 | elif width > height:
92 | result = Image.new(img.mode, (width, width), padding_color)
93 | result.paste(img, (0, (width - height) // 2))
94 | return result
95 | else:
96 | result = Image.new(img.mode, (height, height), padding_color)
97 | result.paste(img, ((height - width) // 2, 0))
98 | return result
99 |
100 | def square_images(path):
101 | """Squares all images in a given path (in-place). Please note that this
102 | method overwrites existing images in the path."""
103 | files = glob(os.path.join(path, '*.png')) + glob(os.path.join(path, '*.jpg'))
104 | for file in files:
105 | im = Image.open(file)
106 | im_squared = get_square_image(im)
107 | im_squared.save(file)
108 |
109 |
110 | if __name__=='__main__':
111 | parser = argparse.ArgumentParser()
112 | parser.add_argument('--augment-count-normal', type=int, default=0)
113 | parser.add_argument('--augment-count-anomalous', type=int, default=0)
114 | parser.add_argument('--image-width', type=int, default=224)
115 | parser.add_argument('--image-height', type=int, default=224)
116 | parser.add_argument('--split', type=float, default=0.1)
117 | args, _ = parser.parse_known_args()
118 |
119 | print('Received arguments {}'.format(args))
120 |
121 | # Define the paths
122 | input_data_base_path = '/opt/ml/processing/input'
123 | train_output_base_path = '/opt/ml/processing/train'
124 | test_output_base_path = '/opt/ml/processing/test'
125 | val_output_base_path = '/opt/ml/processing/val'
126 | report_output_base_path = '/opt/ml/processing/report'
127 |     temp_data_base_path = '/opt/ml/processing/tmp'
128 |
129 | input_path_normal = os.path.join(input_data_base_path, PREFIX_NAME_NORMAL)
130 | input_path_anomalous = os.path.join(input_data_base_path, PREFIX_NAME_ANOMALOUS)
131 |
132 | # The images size used
133 | IMAGE_WIDTH = int(args.image_width)
134 | IMAGE_HEIGHT = int(args.image_height)
135 |
136 | # Augment images if needed
137 | # TODO: Only augment training images, not entire dataset!
138 | print('Augmenting images...')
139 | augment_data(input_path_normal, int(args.augment_count_normal))
140 | augment_data(input_path_anomalous, int(args.augment_count_anomalous))
141 |
142 |     # Square all the images to ensure that only squared images exist in the training dataset by adding a black padding around the image
143 | # IMPORTANT: Make sure you do the same when running inference
144 | print('Squaring all images that are not squared already...')
145 | square_images(input_path_normal)
146 | square_images(input_path_anomalous)
147 |
148 | # Resize the images in-place in the container image
149 | print('Resizing images...')
150 | resize_images(input_path_normal, IMAGE_WIDTH, IMAGE_HEIGHT)
151 | resize_images(input_path_anomalous, IMAGE_WIDTH, IMAGE_HEIGHT)
152 |
153 | # Create train test validation split
154 | # FIXME: only augment train dataset, not the test dataset!
155 | (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = split_dataset(input_data_base_path, split=float(args.split))
156 |
157 |
158 | # Create list files for RecordIO transformation
159 | base_dir_recordio = './'
160 |
161 | with open(base_dir_recordio+'train.lst', 'w+') as f:
162 | for indx, s in enumerate(train_x):
163 | f.write(f'{indx}\t{train_y[indx]}\t{s}\n')
164 |
165 | with open(base_dir_recordio+'val.lst', 'w+') as f:
166 | for indx, s in enumerate(valid_x):
167 | f.write(f'{indx}\t{valid_y[indx]}\t{s}\n')
168 |
169 | with open(base_dir_recordio+'test.lst', 'w+') as f:
170 | for indx, s in enumerate(test_x):
171 | f.write(f'{indx}\t{test_y[indx]}\t{s}\n')
172 |
173 | # Run im2rec.py file to convert to RecordIO
174 | print('Running im2rec.py tool for recordio conversion')
175 | os.system('python3 ./im2rec.py train.lst ./')
176 | os.system('python3 ./im2rec.py val.lst ./')
177 | os.system('python3 ./im2rec.py test.lst ./')
178 |
179 | # Copy to the output paths
180 | shutil.copy('train.rec', os.path.join(train_output_base_path, 'train.rec'))
181 | shutil.copy('val.rec', os.path.join(val_output_base_path, 'val.rec'))
182 | shutil.copy('test.rec', os.path.join(test_output_base_path, 'test.rec'))
183 |
184 | # Save the preprocessing report to make information available to downstream steps
185 | preprocessing_report = {
186 | 'preprocessing': {
187 | 'dataset': {
188 | 'num_training_samples': len(train_x),
189 | 'num_test_samples': len(test_x),
190 | 'num_val_samples': len(valid_x)
191 | }
192 | }
193 | }
194 | print('Preprocessing report:', preprocessing_report)
195 | report_output_path = os.path.join(report_output_base_path, 'preprocessing_report.json')
196 | with open(report_output_path, "w") as f:
197 | f.write(json.dumps(preprocessing_report))
198 |
199 |
--------------------------------------------------------------------------------
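For reference, a tiny illustration of the tab-separated .lst format that the script writes and im2rec.py consumes (index, label with 0 = normal and 1 = anomalous, relative image path); the file names below are made up.

    # Hypothetical two-row list file in the same format as train.lst / val.lst / test.lst
    rows = [
        (0, 0, 'normal/img_0001.png'),       # index, label, relative image path
        (1, 1, 'anomalous/img_0042.png'),
    ]
    with open('example.lst', 'w') as f:
        for indx, label, path in rows:
            f.write(f'{indx}\t{label}\t{path}\n')
    # im2rec.py is then invoked as in the script above, e.g.:
    #   python3 ./im2rec.py example.lst ./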
/src/edge/app/ota.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 | # Source code partially modified from: https://github.com/aws-samples/amazon-sagemaker-edge-manager-demo/blob/main/04_EdgeApplication/turbine/ota.py
4 |
5 | import ssl
6 | import paho.mqtt.client as mqtt
7 | import logging
8 | import json
9 | import os
10 | import io
11 | import time
12 | import requests
13 | import boto3
14 | import tarfile
15 | import glob
16 | import threading
17 | import app
18 |
19 | class OTAModelUpdate(object):
20 | def __init__(self, device_name, iot_params, mqtt_host, mqtt_port, update_callback, model_path, models_supported):
21 | '''
22 | This class is responsible for listening to IoT topics and receiving
23 | a Json document with the metadata of a new model. This module also
24 | downloads the SageMaker Edge Manager deployment package, unpacks it to
25 | a local dir and also controls versioning.
26 | '''
27 | if model_path is None or update_callback is None:
28 |             raise Exception("You need to provide a model_path and an update_callback method")
29 | self.device_name = device_name
30 | self.model_path = model_path
31 | self.update_callback = update_callback
32 | self.iot_params = iot_params
33 | self.models_supported = models_supported
34 |
35 | ## initialize an mqtt client
36 | self.mqttc = mqtt.Client()
37 | self.mqttc.tls_set(
38 | iot_params['sagemaker_edge_provider_aws_ca_cert_file'],
39 | certfile=iot_params['sagemaker_edge_provider_aws_cert_file'],
40 | keyfile=iot_params['sagemaker_edge_provider_aws_cert_pk_file'],
41 | cert_reqs=ssl.CERT_REQUIRED, tls_version=ssl.PROTOCOL_TLSv1_2, ciphers=None
42 | )
43 | self.mqttc.enable_logger(logger=logging)
44 | self.mqttc.on_message = self.__on_message__
45 | self.mqttc.on_connect = self.__on_connect__
46 | self.mqttc.on_disconnect = self.__on_disconnect__
47 | self.connected = False
48 |
49 | self.processing_lock = threading.Lock()
50 | self.processed_jobs = []
51 |
52 | # start the mqtt client
53 | self.mqttc.connect(mqtt_host, mqtt_port, 45)
54 | self.mqttc.loop_start()
55 |
56 | def model_update_check(self):
57 | '''
58 | Check manually if there is a new model available
59 | '''
60 | if self.connected:
61 | self.mqttc.publish('$aws/things/%s/jobs/get' % self.device_name)
62 |
63 | def __on_message__(self, client, userdata, message):
64 | '''
65 | This callback is invoked by MQTTC each time a new message is published
66 | to one of the subscribed topics
67 | '''
68 | logging.debug("New message. Topic: %s; Message: %s;" % (message.topic, message.payload))
69 |
70 | if message.topic.endswith('notify'):
71 | self.model_update_check()
72 |
73 | elif message.topic.endswith('accepted'):
74 | resp = json.loads(message.payload)
75 | logging.debug(resp)
76 | if resp.get('queuedJobs') is not None: # request to list jobs
77 | # get the description of each queued job
78 | for j in resp['queuedJobs']:
79 | ## get the job description
80 | self.mqttc.publish('$aws/things/%s/jobs/%s/get' % ( self.device_name, j['jobId'] ) )
81 | break
82 | elif resp.get('inProgressJobs') is not None: # request to list jobs
83 |             # get the description of each in-progress job
84 | for j in resp['inProgressJobs']:
85 | ## get the job description
86 | self.mqttc.publish('$aws/things/%s/jobs/%s/get' % ( self.device_name, j['jobId'] ) )
87 | break
88 | elif resp.get('execution') is not None: # request to get job description
89 | # check if this is a job description message
90 | job_meta = resp.get('execution')
91 |
92 | # we have the job metadata, let's process it
93 | self.__update_job_status__(job_meta['jobId'], 'IN_PROGRESS', 'Trying to get/load the model')
94 | self.__process_job__(job_meta['jobId'], job_meta['jobDocument'])
95 | else:
96 |             logging.debug('Other message: %s', resp)
97 |
98 | def __on_connect__(self, client, userdata, flags, rc):
99 | '''
100 | This callback is invoked just after MQTTC managed to connect
101 | to the MQTT endpoint
102 | '''
103 | self.connected = True
104 | logging.info("OTA Model Manager Connected to the MQTT endpoint!")
105 | self.mqttc.subscribe('$aws/things/%s/jobs/notify' % self.device_name)
106 | self.mqttc.subscribe('$aws/things/%s/jobs/accepted' % self.device_name)
107 | self.mqttc.subscribe('$aws/things/%s/jobs/rejected' % self.device_name)
108 | time.sleep(1)
109 | self.model_update_check()
110 |
111 | def __on_disconnect__(self, client, userdata, flags):
112 | '''
113 | This callback is invoked when MQTTC disconnected from the MQTT endpoint
114 | '''
115 | self.connected = False
116 | logging.info("OTA Model Manager Disconnected!")
117 |
118 | def __del__(self):
119 | '''
120 | Object destructor
121 | '''
122 | logging.info("OTA Model Manager Deleting this object")
123 | self.mqttc.loop_stop()
124 | self.mqttc.disconnect()
125 |
126 | def __update_job_status__(self, job_id, status, details):
127 | '''
128 | After receiving a new signal that there is a model to be deployed
129 | Update the IoT Job to inform the user the current status of this
130 | process
131 | '''
132 | payload = json.dumps({
133 | "status": status,
134 | "statusDetails": {"info": details },
135 | "includeJobExecutionState": False,
136 | "includeJobDocument": False,
137 | "stepTimeoutInMinutes": 2,
138 | })
139 | logging.info("Updating IoT job status: %s" % details)
140 | self.mqttc.publish('$aws/things/%s/jobs/%s/update' % ( self.device_name, job_id), payload)
141 |
142 |
143 | def __process_job__(self, job_id, msg):
144 | '''
145 | This method is responsible for:
146 | 1. validate the new model version
147 | 2. download the model package
148 | 3. unpack it to a local dir
149 | 4. notify the main application
150 | '''
151 | self.processing_lock.acquire()
152 | if job_id in self.processed_jobs:
153 | self.processing_lock.release()
154 | return
155 | self.processed_jobs.append(job_id)
156 | try:
157 | if msg.get('type') == 'new_model':
158 | model_version = msg['model_version']
159 | model_name = msg['model_name']
160 |
161 | # Check if the application supports the model with the name incoming
162 | if model_name not in self.models_supported:
163 | msg = 'New model %s from incoming deployment is not in list of supported models. Skipping deployment.' % model_name
164 | logging.info(msg)
165 | self.__update_job_status__(job_id, 'FAILED', msg)
166 | self.processing_lock.release()
167 | return
168 |
169 | logging.info("Downloading new model package")
170 | s3_client = app.get_client('s3', self.iot_params)
171 |
172 | package = io.BytesIO(s3_client.get_object(
173 | Bucket=msg['model_package_bucket'],
174 | Key=msg['model_package_key'])['Body'].read()
175 | )
176 | logging.info("Unpacking model package")
177 | with tarfile.open(fileobj=package) as p:
178 | p.extractall(os.path.join(self.model_path, msg['model_name'], msg['model_version']))
179 |
180 | self.__update_job_status__(job_id, 'SUCCEEDED', 'Model deployed')
181 | self.update_callback(model_name, model_version)
182 | else:
183 |                 logging.info("Job document is not a new model deployment or the model is already current/obsolete: %s" % msg)
184 | except Exception as e:
185 | self.__update_job_status__(job_id, 'FAILED', str(e))
186 | logging.error(e)
187 |
188 | self.processing_lock.release()
--------------------------------------------------------------------------------
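For orientation, a hypothetical example (all values made up) of the IoT job document that __process_job__ expects for a model deployment; the keys mirror the fields read in the handler above.

    # Job document shape consumed by OTAModelUpdate.__process_job__
    example_job_document = {
        "type": "new_model",
        "model_name": "img-classification",                                 # must be in models_supported
        "model_version": "1.0",
        "model_package_bucket": "sm-edge-workshop-myproject-123456789012",  # S3 bucket holding the package
        "model_package_key": "deployment/img-classification-1.0.tar.gz"     # S3 key of the .tar.gz package
    }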
/src/cloud/pipelines/semantic_segmentation/preprocessing.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 | import sys
4 | import os
5 | import subprocess
6 |
7 | # Install packages prior to executing the rest of the script. You can also build your own custom container
8 | # with your individual dependencies if needed
9 | subprocess.check_call([sys.executable, "-m", "pip", "install", "wget", "opencv-python","albumentations","tqdm"])
10 | os.system("apt-get update")
11 | os.system("apt-get install ffmpeg libsm6 libxext6 -y")
12 |
13 | import argparse
14 | import json
15 | from glob import glob
16 | import shutil
17 | from PIL import Image
18 | from pathlib import Path
19 |
20 | import cv2
21 | from tqdm import tqdm
22 | from albumentations import CenterCrop, RandomRotate90, GridDistortion, HorizontalFlip, VerticalFlip
23 |
24 | from sklearn.model_selection import train_test_split
25 |
26 |
27 | # Constants
28 |
29 | # the "folders" in the S3 bucket for images and their ground truth masks
30 | PREFIX_NAME_IMAGE = 'images'
31 | PREFIX_NAME_MASK = 'masks'
32 |
33 | # The images size used
34 | IMAGE_WIDTH = 224
35 | IMAGE_HEIGHT = 224
36 |
37 | def augment_data(path, augment=True):
38 | save_path = path
39 | images = sorted(glob(os.path.join(path, PREFIX_NAME_IMAGE + "/*")))
40 | masks = sorted(glob(os.path.join(path, PREFIX_NAME_MASK + "/*")))
41 |
42 | for x, y in tqdm(zip(images, masks), total=len(images)):
43 | name = x.split("/")[-1].split(".")
44 |
45 | img_name = name[0]
46 | image_extn = name[1]
47 |
48 | name = y.split("/")[-1].split(".")
49 | mask_name = name[0]
50 | mask_extn = name[1]
51 |
52 | # Read image mask
53 | x = cv2.imread(x, cv2.IMREAD_COLOR)
54 | y = cv2.imread(y, cv2.IMREAD_COLOR)
55 |
56 | # Augment dataset
57 |         if augment:
58 | aug = RandomRotate90(p=1.0)
59 | augmented = aug(image=x, mask=y)
60 | x1 = augmented['image']
61 | y1 = augmented['mask']
62 |
63 | aug = RandomRotate90(p=1.0)
64 | augmented = aug(image=x, mask=y)
65 | x2 = augmented['image']
66 | y2 = augmented['mask']
67 |
68 | aug = GridDistortion(p=1.0)
69 | augmented = aug(image=x, mask=y)
70 | x3 = augmented['image']
71 | y3 = augmented['mask']
72 |
73 | aug = HorizontalFlip(p=1.0)
74 | augmented = aug(image=x, mask=y)
75 | x4 = augmented['image']
76 | y4 = augmented['mask']
77 |
78 | aug = VerticalFlip(p=1.0)
79 | augmented = aug(image=x, mask=y)
80 | x5 = augmented['image']
81 | y5 = augmented['mask']
82 |
83 | save_images = [x, x1, x2, x3, x4, x5]
84 | save_masks = [y, y1, y2, y3, y4, y5]
85 |
86 | else:
87 | save_images = [x]
88 | save_masks = [y]
89 |
90 | """ Saving the image and mask. """
91 | idx = 0
92 | Path(save_path + "/" + PREFIX_NAME_IMAGE ).mkdir(parents=True, exist_ok=True)
93 | Path(save_path + "/" + PREFIX_NAME_MASK ).mkdir(parents=True, exist_ok=True)
94 | for i, m in zip(save_images, save_masks):
95 | i = cv2.resize(i, (IMAGE_WIDTH, IMAGE_HEIGHT))
96 | m = cv2.resize(m, (IMAGE_WIDTH, IMAGE_HEIGHT))
97 |
98 |             if len(save_images) == 1:
99 | tmp_img_name = f"{img_name}.{image_extn}"
100 | tmp_mask_name = f"{mask_name}.{mask_extn}"
101 | else:
102 | tmp_img_name = f"{img_name}_{idx}.{image_extn}"
103 | tmp_mask_name = f"{mask_name}_{idx}.{mask_extn}"
104 |
105 | image_path = os.path.join(save_path, PREFIX_NAME_IMAGE, tmp_img_name)
106 | mask_path = os.path.join(save_path, PREFIX_NAME_MASK, tmp_mask_name)
107 |
108 | cv2.imwrite(image_path, i)
109 | cv2.imwrite(mask_path, m)
110 |
111 | idx += 1
112 |
113 |
114 | def resize_images(path, width, height):
115 | """Resize all images in a given path (in-place). Please note that this method
116 | overwrites existing images in the path"""
117 | files = glob(os.path.join(path, '*.png')) + glob(os.path.join(path, '*.jpg'))
118 | for file in files:
119 | im = Image.open(file)
120 | im_resized = im.resize((width, height), Image.ANTIALIAS)
121 | im_resized.save(file)
122 |
123 | def get_square_image(img, padding_color=(0, 0, 0)):
124 |     """Returns a squared image by adding padding of the given color (black by default)"""
125 | width, height = img.size
126 | if width == height:
127 | return img
128 | elif width > height:
129 | result = Image.new(img.mode, (width, width), padding_color)
130 | result.paste(img, (0, (width - height) // 2))
131 | return result
132 | else:
133 | result = Image.new(img.mode, (height, height), padding_color)
134 | result.paste(img, ((height - width) // 2, 0))
135 | return result
136 |
137 | def square_images(path, padding_color=(0,0,0)):
138 | """Squares all images in a given path (in-place). Please note that this
139 | method overwrites existing images in the path."""
140 | files = glob(os.path.join(path, '*.png')) + glob(os.path.join(path, '*.jpg'))
141 | for file in files:
142 | im = Image.open(file)
143 | im_squared = get_square_image(im, padding_color)
144 | im_squared.save(file)
145 |
146 | def load_data(path, split=0.1):
147 | images = sorted(glob(os.path.join(path, PREFIX_NAME_IMAGE + "/*")))
148 | masks = sorted(glob(os.path.join(path, PREFIX_NAME_MASK + "/*")))
149 |
150 | total_size = len(images)
151 | valid_size = int(split * total_size)
152 | test_size = int(split * total_size)
153 |     print('Total number of samples (image/mask pairs):', total_size)
154 | train_x, valid_x = train_test_split(images, test_size=valid_size, random_state=42)
155 | train_y, valid_y = train_test_split(masks, test_size=valid_size, random_state=42)
156 |
157 | train_x, test_x = train_test_split(train_x, test_size=test_size, random_state=42)
158 | train_y, test_y = train_test_split(train_y, test_size=test_size, random_state=42)
159 |
160 | return (train_x, train_y), (valid_x, valid_y), (test_x, test_y)
161 |
162 |
163 | if __name__=='__main__':
164 | parser = argparse.ArgumentParser()
165 | parser.add_argument('--split', type=float, default=0.1)
166 | args, _ = parser.parse_known_args()
167 |
168 | print('Received arguments {}'.format(args))
169 |
170 | # Define the paths
171 | input_data_base_path = '/opt/ml/processing/input'
172 | train_output_base_path = '/opt/ml/processing/train'
173 | test_output_base_path = '/opt/ml/processing/test'
174 | val_output_base_path = '/opt/ml/processing/val'
175 | report_output_base_path = '/opt/ml/processing/report'
176 |
177 |     # Augment images and masks; augmented copies are written next to the originals (in-place)
178 | augment_data(input_data_base_path)
179 |
180 | print('Squaring images...')
181 | square_images(os.path.join(input_data_base_path, PREFIX_NAME_IMAGE))
182 |     square_images(os.path.join(input_data_base_path, PREFIX_NAME_MASK), padding_color=0)
183 |
184 | # Resize the images in-place in the container image
185 | print('Resizing images...')
186 | resize_images(os.path.join(input_data_base_path, PREFIX_NAME_IMAGE), IMAGE_WIDTH, IMAGE_HEIGHT)
187 | resize_images(os.path.join(input_data_base_path, PREFIX_NAME_MASK), IMAGE_WIDTH, IMAGE_HEIGHT)
188 |
189 | # Create train test validation split
190 | (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = load_data(input_data_base_path, split=float(args.split))
191 |
192 | # Copy to the output paths
193 | Path(train_output_base_path + "/" + PREFIX_NAME_IMAGE ).mkdir(parents=True, exist_ok=True)
194 | Path(train_output_base_path + "/" + PREFIX_NAME_MASK ).mkdir(parents=True, exist_ok=True)
195 | Path(val_output_base_path + "/" + PREFIX_NAME_IMAGE ).mkdir(parents=True, exist_ok=True)
196 | Path(val_output_base_path + "/" + PREFIX_NAME_MASK ).mkdir(parents=True, exist_ok=True)
197 | Path(test_output_base_path + "/" + PREFIX_NAME_IMAGE ).mkdir(parents=True, exist_ok=True)
198 | Path(test_output_base_path + "/" + PREFIX_NAME_MASK ).mkdir(parents=True, exist_ok=True)
199 | for file in train_x :
200 | shutil.copy(file, os.path.join(train_output_base_path, PREFIX_NAME_IMAGE + '/' + os.path.basename(file)))
201 | for file in train_y :
202 | shutil.copy(file, os.path.join(train_output_base_path, PREFIX_NAME_MASK + '/'+ os.path.basename(file)))
203 | for file in valid_x :
204 | shutil.copy(file, os.path.join(val_output_base_path, PREFIX_NAME_IMAGE + '/'+ os.path.basename(file)))
205 | for file in valid_y :
206 | shutil.copy(file, os.path.join(val_output_base_path, PREFIX_NAME_MASK + '/'+ os.path.basename(file)))
207 | for file in test_x :
208 | shutil.copy(file, os.path.join(test_output_base_path, PREFIX_NAME_IMAGE + '/'+ os.path.basename(file)))
209 | for file in test_y :
210 | shutil.copy(file, os.path.join(test_output_base_path, PREFIX_NAME_MASK + '/'+ os.path.basename(file)))
211 | # Save the preprocessing report to make information available to downstream steps
212 | preprocessing_report = {
213 | 'preprocessing': {
214 | 'dataset': {
215 | 'num_training_samples': len(train_x),
216 | 'num_test_samples': len(test_x),
217 | 'num_val_samples': len(valid_x)
218 | }
219 | }
220 | }
221 | print('Preprocessing report:', preprocessing_report)
222 | report_output_path = os.path.join(report_output_base_path, 'preprocessing_report.json')
223 | with open(report_output_path, "w") as f:
224 | f.write(json.dumps(preprocessing_report))
225 |
226 |
227 |
--------------------------------------------------------------------------------
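A small helper sketch for running this preprocessing script outside of SageMaker: it recreates the images/ and masks/ folder contract under the default processing paths used above (swap in a scratch directory if you do not want to write to /opt/ml locally).

    import os

    BASE = '/opt/ml/processing'           # default SageMaker Processing base path used in the script
    SUBDIRS = [
        'input/images', 'input/masks',    # expected input layout (PREFIX_NAME_IMAGE / PREFIX_NAME_MASK)
        'train', 'test', 'val', 'report'  # output base paths written by the script
    ]
    for sub in SUBDIRS:
        os.makedirs(os.path.join(BASE, sub), exist_ok=True)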
/src/edge/run.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 | import os
4 | import numpy as np
5 | import json
6 | import logging
7 | import PIL.Image
8 | import glob
9 | import random
10 | import re
11 | from timeit import default_timer as timer
12 |
13 | from flask import Flask
14 | from flask import render_template
15 | from waitress import serve
16 | flask_app = Flask(__name__)
17 |
18 | import app
19 |
20 | # Get environment variables
21 | if not 'SM_EDGE_AGENT_HOME' in os.environ:
22 | logging.error('You need to define the environment variable SM_EDGE_AGENT_HOME')
23 | raise Exception('Environment variable not defined')
24 |
25 | if not 'SM_APP_ENV' in os.environ:
26 | logging.error('You need to define the environment variable SM_APP_ENV as either "prod" or "dev"')
27 | raise Exception('Environment variable not defined')
28 |
29 | # Configuration constants
30 | SM_EDGE_AGENT_HOME = os.environ['SM_EDGE_AGENT_HOME']
31 | AGENT_SOCKET = '/tmp/edge_agent'
32 | SM_EDGE_MODEL_PATH = os.path.join(SM_EDGE_AGENT_HOME, 'model/dev')
33 | SM_EDGE_CONFIGFILE_PATH = os.path.join(SM_EDGE_AGENT_HOME, 'conf/config_edge_device.json')
34 | CONFIG_FILE_PATH = './models_config.json'
35 | SM_APP_ENV = os.environ['SM_APP_ENV']
36 | IMG_WIDTH = 224
37 | IMG_HEIGHT = 224
38 |
39 | # Set up logging
40 | logging.basicConfig(level=logging.INFO)
41 | logging.debug('Initializing...')
42 |
43 | # Loading config file
44 | with open(CONFIG_FILE_PATH, 'r') as f:
45 | config = json.load(f)
46 |
47 | # Load SM Edge Agent config file
48 | iot_params = json.loads(open(SM_EDGE_CONFIGFILE_PATH, 'r').read())
49 |
50 | # Retrieve the IoT thing name associated with the edge device
51 | iot_client = app.get_client('iot', iot_params)
52 | sm_client = app.get_client('sagemaker', iot_params)
53 |
54 | resp = sm_client.describe_device(
55 | DeviceName=iot_params['sagemaker_edge_core_device_name'],
56 | DeviceFleetName=iot_params['sagemaker_edge_core_device_fleet_name']
57 | )
58 | device_name = resp['IotThingName']
59 | mqtt_host = iot_client.describe_endpoint(endpointType='iot:Data-ATS')['endpointAddress']
60 | mqtt_port = 8883
61 |
62 | # Send logs to cloud via MQTT topics
63 | logger = app.Logger(device_name, iot_params)
64 |
65 | # Initialize the Edge Manager agent
66 | edge_agent = app.EdgeAgentClient(AGENT_SOCKET)
67 |
68 | # A list of names of loaded models with their name, version and identifier
69 | models_loaded = []
70 |
71 | def create_model_identifier(name, version):
72 | """Get a compatible string as a combination of name and version"""
73 | new_name = "%s-%s" % (name, str(version).replace('.', '-'))
74 | return new_name
75 |
76 | def get_model_from_name(name, config_dict):
77 | """Returns the model dict from the config dict"""
78 | model_obj = next((x for x in config_dict['models'] if x['name'] == name), None)
79 | if model_obj is not None:
80 | return model_obj
81 | else:
82 | logging.warning('Model object not found in config')
83 | return None
84 |
85 | def get_square_image(img):
86 | """Returns a squared image by adding black padding"""
87 | padding_color = (0, 0, 0)
88 | width, height = img.size
89 | if width == height:
90 | return img
91 | elif width > height:
92 | result = PIL.Image.new(img.mode, (width, width), padding_color)
93 | result.paste(img, (0, (width - height) // 2))
94 | return result
95 | else:
96 | result = PIL.Image.new(img.mode, (height, height), padding_color)
97 | result.paste(img, ((height - width) // 2, 0))
98 | return result
99 |
100 |
101 | def preprocess_image(img, img_width, img_height):
102 | """Preprocesses the image before feeding it into the ML model"""
103 | x = get_square_image(img)
104 |     x = np.asarray(x.resize((img_width, img_height))).astype(np.float32)  # resize the squared image, not the original
105 | x_transposed = x.transpose((2,0,1))
106 | x_batchified = np.expand_dims(x_transposed, axis=0)
107 | return x_batchified
108 |
109 | # Setup model callback method
110 | def load_model(name, version):
111 | """Loads the model into the edge agent and unloads previous versions if any."""
112 | global models_loaded
113 | version = str(version)
114 | # Create a model name string as a concatenation of name and version
115 |     identifier = create_model_identifier(name, version)
116 |
117 | # Check if previous version of this model was loaded already and unload it
118 | matching_model_dict = next((m for m in models_loaded if m['name'] == name), None)
119 | if matching_model_dict:
120 | logging.info('Previous version of new model found: %s' % matching_model_dict)
121 |
122 | # Check if version is higher
123 | if float(version) <= float(matching_model_dict['version']):
124 | logging.info('New model version is not higher than previous version. Not loading incoming model.')
125 | return
126 |
127 | logging.info('Loading model into edge agent: %s' % identifier)
128 | resp = edge_agent.load_model(identifier, os.path.join(SM_EDGE_MODEL_PATH, name, version))
129 | if resp is None:
130 | logging.error('It was not possible to load the model. Is the agent running?')
131 | return
132 | else:
133 | models_loaded.append({'name': name, 'version': version, 'identifier': identifier})
134 |         logging.info('Successfully loaded new model version into agent')
135 | if matching_model_dict:
136 | logging.info('Unloading previous model version')
137 | edge_agent.unload_model(matching_model_dict['identifier'])
138 | models_loaded.remove(matching_model_dict)
139 |
140 | def run_segmentation_inference(agent, filename):
141 | """Runs inference on the given image file. Returns prediction and model latency."""
142 |
143 | # Check if model for segmentation is downloaded
144 | model_name_img_seg = config['mappings']['image-segmentation-app']
145 | model_is_loaded = any([m['name']==model_name_img_seg for m in models_loaded])
146 | if not model_is_loaded:
147 | logging.info('Model for image segmentation not loaded, waiting for deployment...')
148 | return None, None
149 |
150 | # Get the identifier of the currently loaded model
151 | model_dict_img_seg = next((x for x in models_loaded if x['name'] == model_name_img_seg), None)
152 | if not model_dict_img_seg:
153 | logging.info('Model for image segmentation not loaded, waiting for deployment...')
154 | return None, None
155 | model_id_img_seg = model_dict_img_seg['identifier']
156 |
157 | logging.info('\nSegmentation inference with file %s and model %s' % (filename, model_id_img_seg))
158 | image = PIL.Image.open(filename)
159 | image = image.convert(mode='RGB')
160 |
161 | # Preprocessing
162 | x_batchified = preprocess_image(image, IMG_WIDTH, IMG_HEIGHT)
163 |
164 | # Fit into 0-1 range, as the unet model expects this
165 | x_batchified = x_batchified/255.0
166 |
167 | # Run inference
168 | t_start = timer()
169 | y = agent.predict(model_id_img_seg, x_batchified)
170 | t_stop = timer()
171 | t_ms = np.round((t_stop - t_start) * 1000, decimals=0)
172 |
173 | y_mask = y[0] > 0.5
174 | agent.capture_data(model_id_img_seg, x_batchified, y.astype(np.float32))
175 |
176 | return y_mask, t_ms
177 |
178 |
179 | def run_classification_inference(agent, filename):
180 | """Runs inference on the given image file. Returns prediction and model latency."""
181 | # Check if the model for image classification is available
182 | # The application always uses the latest version of the model in the list of loaded models
183 | model_name_img_clf = config['mappings']['image-classification-app']
184 | model_is_loaded = any([m['name']==model_name_img_clf for m in models_loaded])
185 | if not model_is_loaded:
186 | logging.info('Model for image classification not loaded, waiting for deployment...')
187 | return None, None
188 |
189 | # Get the identifier of the currently loaded model
190 | model_dict_img_clf = next((x for x in models_loaded if x['name'] == model_name_img_clf), None)
191 | if not model_dict_img_clf:
192 | logging.info('Model for image classification not loaded, waiting for deployment...')
193 | return None, None
194 | model_id_img_clf = model_dict_img_clf['identifier']
195 |
196 | logging.info('\nClassification inference with %s' % filename)
197 | image = PIL.Image.open(filename)
198 | image = image.convert(mode='RGB')
199 |
200 | # Preprocessing
201 | x_batchified = preprocess_image(image, IMG_WIDTH, IMG_HEIGHT)
202 |
203 | # Run inference with agent and time taken
204 | t_start = timer()
205 | y = agent.predict(model_id_img_clf, x_batchified)
206 | t_stop = timer()
207 | t_ms = np.round((t_stop - t_start) * 1000, decimals=0)
208 |
209 | agent.capture_data(model_id_img_clf, x_batchified, y)
210 | y = y.ravel()
211 | logging.info(y)
212 |
213 | img_clf_class_labels = ['normal', 'anomalous']
214 |
215 | for indx, l in enumerate(img_clf_class_labels):
216 | logging.info('Class probability label "%s": %f' % (img_clf_class_labels[indx], y[indx]))
217 | return y, t_ms
218 |
219 |
220 | # Get list of supported model names
221 | models_supported = config['mappings'].values()
222 |
223 | # Initialize OTA model manager
224 | model_manager = app.OTAModelUpdate(device_name, iot_params, mqtt_host, mqtt_port, load_model, SM_EDGE_MODEL_PATH, models_supported)
225 |
226 | @flask_app.route('/')
227 | def homepage():
228 | # Get a random image from the directory
229 | list_img_inf = glob.glob('./static/**/*.png')
230 |
231 | if len(list_img_inf) == 0:
232 | return render_template('main_noimg.html',
233 | loaded_models=models_loaded
234 | )
235 |
236 | inference_img_path = random.choice(list_img_inf)
237 | inference_img_filename = re.search(r'(?<=\/static\/).+$', inference_img_path)[0]
238 |
239 | # Run inference on this image
240 | y_clf, t_ms_clf = run_classification_inference(edge_agent, inference_img_path)
241 | y_segm, t_ms_segm = run_segmentation_inference(edge_agent, inference_img_path)
242 |
243 | # Synthesize mask into binary image
244 | if y_segm is not None:
245 | segm_img_encoded = app.create_b64_img_from_mask(y_segm)
246 | segm_img_decoded_utf8 = segm_img_encoded.decode('utf-8')
247 | logging.info('Model latency: t_segm=%fms' % t_ms_segm)
248 | else:
249 | segm_img_encoded = None
250 | segm_img_decoded_utf8 = None
251 |
252 | # Extract predictions from the y array
253 | # Assuming that the entry at index=0 is the probability for "normal" and the other for "anomalous"
254 | clf_class_labels = ['normal', 'anomalous']
255 | if y_clf is not None:
256 | y_clf_normal = np.round(y_clf[0], decimals=6)
257 | y_clf_anomalous = np.round(y_clf[1], decimals=6)
258 | y_clf_class = clf_class_labels[np.argmax(y_clf)]
259 | logging.info('Model latency: t_classification=%fms' % t_ms_clf)
260 | else:
261 | y_clf_normal = None
262 | y_clf_anomalous = None
263 | y_clf_class = None
264 |
265 |
266 | # Return rendered HTML page with predictions
267 | return render_template('main.html',
268 | loaded_models=models_loaded,
269 | image_file=inference_img_filename,
270 | y_clf_normal=y_clf_normal,
271 | y_clf_anomalous=y_clf_anomalous,
272 | y_clf_class=y_clf_class,
273 | y_segm_img=segm_img_decoded_utf8,
274 | latency_clf=t_ms_clf,
275 | latency_segm=t_ms_segm
276 | )
277 |
278 | # INIT APP
279 | # Initially load models as defined in config file
280 | for model_config in config['models']:
281 | model_name = model_config['name']
282 | model_version = model_config['version']
283 | try:
284 | load_model(model_name, model_version)
285 | except Exception as e:
286 | logging.error('Model could not be loaded. Did you specify it properly in the config file?')
287 | raise e
288 |
289 |
290 | if __name__ == '__main__':
291 | try:
292 | if SM_APP_ENV == 'prod':
293 | serve(flask_app, host='0.0.0.0', port=8080)
294 | elif SM_APP_ENV == 'dev':
295 | flask_app.run(debug=False, use_reloader=False, host='0.0.0.0', port=8080)
296 | else:
297 | raise Exception('SM_APP_ENV needs to be either "prod" or "dev"')
298 |
299 | except KeyboardInterrupt as e:
300 | pass
301 | except Exception as e:
302 | logging.error(e)
303 |
304 | logging.info('Shutting down')
305 |
306 | for m in models_loaded:
307 | logging.info("Unloading model %s" % m)
308 | edge_agent.unload_model(m['identifier'])
309 |
310 |
311 | # Updating config file
312 | config['models'] = models_loaded
313 |
314 | with open(CONFIG_FILE_PATH, 'w') as f:
315 | json.dump(config, f)
316 |
317 | del model_manager
318 | del edge_agent
319 | del logger
--------------------------------------------------------------------------------
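For reference, a hypothetical example (model names and versions are made up) of the structure run.py expects in models_config.json: 'mappings' ties each application role to a model name, and 'models' lists the versions to load at startup.

    import json

    example_config = {
        "mappings": {
            "image-classification-app": "img-classification",   # model used by run_classification_inference
            "image-segmentation-app": "unet-segmentation"       # model used by run_segmentation_inference
        },
        "models": [
            {"name": "img-classification", "version": "1.0"},
            {"name": "unet-segmentation", "version": "1.0"}
        ]
    }
    with open('models_config.json', 'w') as f:
        json.dump(example_config, f, indent=2)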
/src/cloud/pipelines/image_classification/pipeline.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 | import os
4 | import numpy as np
5 | import glob
6 | import boto3
7 | import time
8 | import sagemaker
9 | import sagemaker.session
10 |
11 | from sagemaker.workflow.parameters import ParameterInteger, ParameterString
12 | from sagemaker.sklearn.processing import SKLearnProcessor
13 | from sagemaker.estimator import Estimator
14 | from sagemaker.processing import ProcessingInput, ProcessingOutput
15 | from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CreateModelStep, CacheConfig
16 | from sagemaker.workflow.properties import PropertyFile
17 | from sagemaker.workflow.functions import Join, JsonGet
18 | from sagemaker.workflow.condition_step import ConditionStep
19 | from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
20 | from sagemaker.inputs import TrainingInput, CreateModelInput
21 | from sagemaker.workflow.step_collections import RegisterModel
22 | from sagemaker.model_metrics import MetricsSource, ModelMetrics
23 | from sagemaker.model import Model
24 | from sagemaker.workflow.pipeline import Pipeline
25 | from sagemaker.image_uris import retrieve
26 |
27 | from botocore.exceptions import ClientError, ValidationError
28 |
29 | BASE_DIR = os.path.dirname(os.path.realpath(__file__))
30 |
31 | def get_session(region, default_bucket):
32 | """Gets the sagemaker session based on the region.
33 |
34 | Args:
35 | region: the aws region to start the session
36 | default_bucket: the bucket to use for storing the artifacts
37 |
38 | Returns:
39 |         `sagemaker.session.Session` instance
40 | """
41 |
42 | boto_session = boto3.Session(region_name=region)
43 |
44 | sagemaker_client = boto_session.client("sagemaker")
45 | runtime_client = boto_session.client("sagemaker-runtime")
46 | return sagemaker.session.Session(
47 | boto_session=boto_session,
48 | sagemaker_client=sagemaker_client,
49 | sagemaker_runtime_client=runtime_client,
50 | default_bucket=default_bucket,
51 | )
52 |
53 | def get_pipeline(
54 | region,
55 | role=None,
56 | default_bucket=None,
57 | pipeline_name="defect-detection-img-classification-pipeline",
58 | base_job_prefix="defect-detection-img-classification",
59 | ):
60 |     """Gets a SageMaker ML Pipeline instance working on DefectDetection data.
61 |
62 | Args:
63 | region: AWS region to create and run the pipeline.
64 | role: IAM role to create and run steps and pipeline.
65 | default_bucket: the bucket to use for storing the artifacts
66 |
67 | Returns:
68 | an instance of a pipeline
69 | """
70 | sagemaker_session = get_session(region, default_bucket)
71 | if role is None:
72 | role = sagemaker.session.get_execution_role(sagemaker_session)
73 |
74 | ## By enabling cache, if you run this pipeline again, without changing the input
75 | ## parameters it will skip the training part and reuse the previous trained model
76 | cache_config = CacheConfig(enable_caching=True, expire_after="30d")
77 | ts = time.strftime('%Y-%m-%d-%H-%M-%S')
78 |
79 | # Data prep
80 | processing_instance_type = ParameterString( # instance type for data preparation
81 | name="ProcessingInstanceType",
82 | default_value="ml.m5.xlarge"
83 | )
84 | processing_instance_count = ParameterInteger( # number of instances used for data preparation
85 | name="ProcessingInstanceCount",
86 | default_value=1
87 | )
88 |
89 | # Input shape
90 |     # --> Image size (height and width; only square images are used) desired for training. The
91 | # pipeline will square the images to this size if they are not square already by adding padding.
92 | target_image_size = ParameterString(
93 | name="TargetImageSize",
94 | default_value="224"
95 | )
96 |
97 |     # Augment count
98 | augment_count_normal = ParameterString( # by how many samples you want to augment the normal samples
99 | name="AugmentCountNormal",
100 | default_value="0"
101 | )
102 | augment_count_anomalous = ParameterString( # by how many samples you want to augment the anomalous samples
103 | name="AugmentCountAnomalous",
104 | default_value="0"
105 | )
106 |
107 | # Training
108 | training_instance_type = ParameterString( # instance type for training the model
109 | name="TrainingInstanceType",
110 | default_value="ml.p3.2xlarge"
111 | )
112 | training_instance_count = ParameterInteger( # number of instances used to train your model
113 | name="TrainingInstanceCount",
114 | default_value=1
115 | )
116 | training_epochs = ParameterString(
117 | name="TrainingEpochs",
118 | default_value="15"
119 | )
120 | training_num_training_samples = ParameterString(
121 | name="TrainingNumTrainingSamples",
122 | default_value="3600" # Change this to the number of training samples used!
123 | )
124 |
125 | # Dataset input data: S3 path
126 | input_data = ParameterString(
127 | name="InputData",
128 | default_value="",
129 | )
130 |
131 | # Model Approval State
132 | model_approval_status = ParameterString(
133 | name="ModelApprovalStatus",
134 | default_value="PendingManualApproval"
135 | )
136 |
137 | # Model package group name for registering in model registry
138 | model_package_group_name = ParameterString(
139 | name="ModelPackageGroupName",
140 | default_value="defect-detection-img-classification-model-group"
141 | )
142 |
143 |
144 | aws_region = sagemaker_session.boto_region_name
145 | training_image = retrieve(framework='image-classification', region=aws_region, image_scope='training')
146 |
147 | # Hardcoded hyperparameters
148 | NUM_CLASSES = 2
149 | BATCH_SIZE = 8
150 |
151 | # The preprocessor
152 | preprocessor = SKLearnProcessor(
153 | framework_version="0.23-1",
154 | role=role,
155 | instance_type=processing_instance_type,
156 | instance_count=processing_instance_count,
157 | max_runtime_in_seconds=7200,
158 | )
159 |
160 | # A preprocessing report to store some information from the preprocessing step for next steps
161 | preprocessing_report = PropertyFile(
162 | name='PreprocessingReport',
163 | output_name='preprocessing_report',
164 | path='preprocessing_report.json'
165 | )
166 |
167 | # Preprocessing Step
168 | step_process = ProcessingStep(
169 | name="DefectDetectionPreprocessing",
170 | code=os.path.join(BASE_DIR, 'preprocessing.py'), ## this is the script defined above
171 | processor=preprocessor,
172 | inputs=[
173 | ProcessingInput(source=input_data, destination='/opt/ml/processing/input')
174 | ],
175 | outputs=[
176 | ProcessingOutput(output_name='train_data', source='/opt/ml/processing/train'),
177 | ProcessingOutput(output_name='test_data', source='/opt/ml/processing/test'),
178 | ProcessingOutput(output_name='val_data', source='/opt/ml/processing/val'),
179 | ProcessingOutput(output_name='preprocessing_report', source='/opt/ml/processing/report')
180 | ],
181 | job_arguments=[
182 | '--split', '0.1',
183 | '--augment-count-normal', augment_count_normal,
184 | '--augment-count-anomalous', augment_count_anomalous,
185 | '--image-width', target_image_size,
186 | '--image-height', target_image_size
187 | ],
188 | property_files=[preprocessing_report]
189 | )
190 |
191 | # Define Image Classification Estimator
192 | hyperparameters = {
193 | 'num_layers': 18,
194 | 'image_shape': Join(on=',', values=['3', target_image_size, target_image_size]),
195 | 'num_classes': NUM_CLASSES,
196 | 'mini_batch_size': BATCH_SIZE,
197 | 'num_training_samples': training_num_training_samples,
198 | 'epochs': training_epochs,
199 | 'learning_rate': 0.01,
200 | 'top_k': 2,
201 | 'use_pretrained_model': 1,
202 | 'precision_dtype': 'float32'
203 | }
204 |
205 | ic_estimator = Estimator(
206 | image_uri=training_image,
207 | role=role,
208 | instance_count=training_instance_count,
209 | instance_type=training_instance_type,
210 | volume_size = 50,
211 | max_run = 360000,
212 | input_mode= 'Pipe',
213 | base_job_name='img-classification-training',
214 | output_path='s3://{}/{}/{}/{}'.format(default_bucket, 'models', base_job_prefix, 'training-output'),
215 | hyperparameters=hyperparameters
216 | )
217 |
218 | step_train = TrainingStep(
219 | name="DefectDetectionImageClassificationTrain",
220 | estimator=ic_estimator,
221 | inputs={
222 | "train": TrainingInput(
223 | s3_data=step_process.properties.ProcessingOutputConfig.Outputs["train_data"].S3Output.S3Uri,
224 | content_type="application/x-recordio",
225 | s3_data_type='S3Prefix'
226 | ),
227 | "validation": TrainingInput(
228 | s3_data=step_process.properties.ProcessingOutputConfig.Outputs["val_data"].S3Output.S3Uri,
229 | content_type="application/x-recordio",
230 | s3_data_type='S3Prefix'
231 | )
232 | },
233 | cache_config=cache_config
234 | )
235 |
236 | # Set up for the evaluation processing step
237 | evaluation_report = PropertyFile(
238 | name='EvaluationReport',
239 | output_name='evaluation_report',
240 | path='evaluation_report.json'
241 | )
242 |
243 |     evaluation_processor = SKLearnProcessor(
244 | framework_version="0.23-1",
245 | role=role,
246 | instance_type=processing_instance_type,
247 | instance_count=processing_instance_count,
248 | max_runtime_in_seconds=7200
249 | )
250 |
251 | step_eval = ProcessingStep(
252 | name="DefectDetectionEvaluation",
253 | code=os.path.join(BASE_DIR, 'evaluation.py'), ## this is the script defined above
254 |         processor=evaluation_processor,
255 | inputs=[
256 | ProcessingInput(source=step_process.properties.ProcessingOutputConfig.Outputs["test_data"].S3Output.S3Uri, destination='/opt/ml/processing/test'),
257 | ProcessingInput(source=step_train.properties.ModelArtifacts.S3ModelArtifacts, destination='/opt/ml/processing/model')
258 |
259 | ],
260 | outputs=[
261 | ProcessingOutput(output_name='evaluation_report', source='/opt/ml/processing/report')
262 | ],
263 | property_files=[evaluation_report],
264 | job_arguments=[
265 | '--image-width', target_image_size,
266 | '--image-height', target_image_size
267 | ],
268 | )
269 |
270 | model_metrics = ModelMetrics(
271 | model_statistics=MetricsSource(
272 | s3_uri="{}/evaluation_report.json".format(
273 | step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
274 | ),
275 | content_type="application/json",
276 | )
277 | )
278 |
279 | # Register model step that will be conditionally executed
280 | step_register = RegisterModel(
281 | name="DefectDetectionImageClassificationRegister",
282 | estimator=ic_estimator,
283 | model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
284 | content_types=["application/x-recordio"],
285 | response_types=["application/json"],
286 | inference_instances=["ml.c5.2xlarge", "ml.p3.2xlarge"],
287 | transform_instances=["ml.c5.xlarge"],
288 | model_package_group_name=model_package_group_name,
289 | model_metrics=model_metrics,
290 | approval_status=model_approval_status
291 | )
292 |
293 | # Condition step for evaluating model quality and branching execution
294 |     cond_gte = ConditionGreaterThanOrEqualTo( # You can change the condition here
295 | left=JsonGet(
296 | step_name=step_eval.name,
297 | property_file=evaluation_report,
298 |             json_path="multiclass_classification_metrics.accuracy.value", # This should follow the structure of your report_dict defined in the evaluation.py file.
299 | ),
300 | right=0.8, # You can change the threshold here
301 | )
302 | step_cond = ConditionStep(
303 | name="DefectDetectionImageClassificationAccuracyCondition",
304 |         conditions=[cond_gte],
305 | if_steps=[step_register],
306 | else_steps=[],
307 | )
308 |
309 | pipeline = Pipeline(
310 | name=pipeline_name,
311 | parameters=[
312 | processing_instance_type,
313 | processing_instance_count,
314 | target_image_size,
315 | augment_count_normal,
316 | augment_count_anomalous,
317 | training_instance_type,
318 | training_instance_count,
319 | training_num_training_samples,
320 | training_epochs,
321 | input_data,
322 | model_approval_status,
323 | model_package_group_name
324 | ],
325 | steps=[step_process, step_train, step_eval, step_cond],
326 | sagemaker_session=sagemaker_session,
327 | )
328 | return pipeline
--------------------------------------------------------------------------------
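A minimal illustration (the accuracy value is made up) of the evaluation_report.json structure that the JsonGet condition above reads; only the multiclass_classification_metrics.accuracy.value path is needed for the threshold check.

    import json

    example_report = {
        "multiclass_classification_metrics": {
            "accuracy": {
                "value": 0.93    # compared against the 0.8 threshold in the ConditionStep
            }
        }
    }
    print(json.dumps(example_report, indent=2))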
/src/cloud/semantic_segmentation_pipeline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Defect Detection: Semantic Segmentation - Pipeline Execution\n",
8 | "\n",
9 | "In this notebook, we will use the pipeline configured in the included Python package under `pipelines`, together with the defined preprocessing and training code, to automate model training. It is designed so that you can simply drop in your own semantic segmentation input data and have a model trained automatically."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import boto3\n",
19 | "import sagemaker\n",
20 | "import time\n",
21 | "import uuid\n",
22 | "import json\n",
23 | "\n",
24 | "iot_client = boto3.client('iot')\n",
25 | "sts_client = boto3.client('sts')\n",
26 | "sm_client = boto3.client('sagemaker')\n",
27 | "\n",
28 | "# Get the account id\n",
29 | "account_id = sts_client.get_caller_identity()[\"Account\"]\n",
30 | "\n",
31 | "# Project Name as defined in your CloudFormation template\n",
32 | "PROJECT_NAME = ''\n",
33 | "\n",
34 | "region = boto3.Session().region_name\n",
35 | "role = sagemaker.get_execution_role()\n",
36 | "bucket_name = 'sm-edge-workshop-%s-%s' % (PROJECT_NAME, account_id)\n",
37 | "\n",
38 | "# Change these to reflect your project/business name or if you want to separate ModelPackageGroup/Pipeline from the rest of your team\n",
39 | "model_package_group_name = 'defect-detection-semantic-segmentation-%s' % PROJECT_NAME\n",
40 | "job_prefix = 'defect-detection-semantic-segmentation'\n",
41 | "pipeline_name = 'defect-detection-semantic-segmentation-pipeline-%s' % PROJECT_NAME"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "### Getting the pipeline definition\n",
49 | "\n",
50 | "We use the `get_pipeline` method to create a pipeline DAG definition from the provided input. The input provided here is fixed for each pipeline you create or update; you cannot change these values with each execution (see the usage of parameters in the cells below)."
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "from pipelines.semantic_segmentation.pipeline import get_pipeline\n",
60 | "\n",
61 | "pipeline = get_pipeline(\n",
62 | " region=region,\n",
63 | " role=role,\n",
64 | " default_bucket=bucket_name,\n",
65 | " pipeline_name=pipeline_name,\n",
66 | " base_job_prefix=job_prefix\n",
67 | ")"
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {},
73 | "source": [
74 | "### Creating the pipeline\n",
75 | "\n",
76 | "We create the pipeline (or update it in case it exists) with the previously defined DAG definition."
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "pipeline.upsert(role_arn=role)"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "### Starting the pipeline execution\n",
93 | "\n",
94 | "We now start the execution of the pipeline with a given set of parameters, which we can alter for every execution."
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "input_data_path = 's3://%s/' % bucket_name\n",
104 | "\n",
105 | "execution = pipeline.start(\n",
106 | " parameters=dict(\n",
107 | " InputData=input_data_path,\n",
108 | " TrainingInstanceType=\"ml.p3.2xlarge\",\n",
109 | " ModelApprovalStatus=\"Approved\",\n",
110 | " ModelPackageGroupName=model_package_group_name\n",
111 | " )\n",
112 | ")"
113 | ]
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {},
118 | "source": [
119 | "### Check progress\n",
120 | "\n",
121 | "After the execution has started, you can check the progress of your pipeline execution by looking at the processing and training jobs in the SageMaker console, by using the built-in SageMaker Studio pipeline visualization, or by using SDK methods like the one below."
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "execution.describe()"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "## Preparing trained model for edge\n",
138 | "\n",
139 | "Please proceed here only if the execution of the training pipeline was successful. In this part of the workshop, we will prepare the model which you just trained in the pipeline for deployment onto the edge device."
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "compilation_output_sub_folder = 'models/' + job_prefix + '/compilation-output'\n",
149 | "edgepackaging_output_sub_folder = 'models/' + job_prefix + '/edge-packaging-output'\n",
150 | "\n",
151 | "# S3 Location to save the model artifact after compilation\n",
152 | "s3_compilation_output_location = 's3://{}/{}'.format(bucket_name, compilation_output_sub_folder)\n",
153 | "\n",
154 | "# S3 Location to save the model artifact after edge packaging\n",
155 | "s3_edgepackaging_output_location = 's3://{}/{}'.format(bucket_name, edgepackaging_output_sub_folder)"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "# Define some helper functions\n",
165 | "\n",
166 | "def get_latest_approved_s3_model_location(client, model_package_group):\n",
167 | " \"\"\"Returns the model location of the latest approved model version in a group\"\"\"\n",
168 | " response = client.list_model_packages(\n",
169 | "        ModelPackageGroupName=model_package_group,\n",
170 | " ModelApprovalStatus='Approved'\n",
171 | " )\n",
172 | " latest_version = max(response['ModelPackageSummaryList'], key=lambda x:x['ModelPackageVersion'])\n",
173 | "    model_artifact_location = client.describe_model_package(ModelPackageName=latest_version['ModelPackageArn'])['InferenceSpecification']['Containers'][0]['ModelDataUrl']\n",
174 | " return model_artifact_location\n",
175 | "\n",
176 | "def get_latest_approved_model_version(client, model_package_group):\n",
177 | " \"\"\"Returns the model version of the latest approved model version in a group\"\"\"\n",
178 | " response = client.list_model_packages(\n",
179 | "        ModelPackageGroupName=model_package_group,\n",
180 | " ModelApprovalStatus='Approved'\n",
181 | " )\n",
182 | " latest_version = max(response['ModelPackageSummaryList'], key=lambda x:x['ModelPackageVersion'])\n",
183 | " return latest_version['ModelPackageVersion']"
184 | ]
185 | },
186 | {
187 | "cell_type": "markdown",
188 | "metadata": {},
189 | "source": [
190 | "### Run SageMaker Neo compilation job"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": null,
196 | "metadata": {},
197 | "outputs": [],
198 | "source": [
199 | "# Retrieve some information on the model we just trained and registered in SageMaker Model Registry\n",
200 | "s3_model_artifact_location = get_latest_approved_s3_model_location(sm_client, model_package_group_name)\n",
201 | "print(s3_model_artifact_location)\n",
202 | "\n",
203 | "model_name = 'unet'\n",
204 | "compilation_job_name = '%s-%d' % (model_name, int(time.time()*1000))\n",
205 | "\n",
206 | "# Lets start a compilation job for the target architecture\n",
207 | "sm_client.create_compilation_job(\n",
208 | " CompilationJobName=compilation_job_name,\n",
209 | " RoleArn=role,\n",
210 | " InputConfig={\n",
211 | " 'S3Uri': s3_model_artifact_location,\n",
212 | " 'DataInputConfig': '{\"input_image\":[1,%d,%d,%d]}' % (3,224, 224),\n",
213 | " 'Framework': 'KERAS'\n",
214 | " },\n",
215 | " OutputConfig={\n",
216 | " 'S3OutputLocation': s3_compilation_output_location,\n",
217 | " 'TargetPlatform': { 'Os': 'LINUX', 'Arch': 'X86_64' }\n",
218 | " #'TargetPlatform': { 'Os': 'LINUX', 'Arch': 'ARM64', 'Accelerator': 'NVIDIA' },\n",
219 | " #'CompilerOptions': '{\"trt-ver\": \"7.1.3\", \"cuda-ver\": \"10.2\", \"gpu-code\": \"sm_53\"}'\n",
220 | " #'TargetPlatform': { 'Os': 'LINUX', 'Arch': 'ARM64'},\n",
221 | " #'TargetDevice': 'ml_c5'\n",
222 | " },\n",
223 | " StoppingCondition={ 'MaxRuntimeInSeconds': 900 }\n",
224 | ")\n",
225 | "\n",
226 | "# Poll the status of the job\n",
227 | "print('Started compilation job .', end='')\n",
228 | "while True:\n",
229 | " resp = sm_client.describe_compilation_job(CompilationJobName=compilation_job_name)\n",
230 | " if resp['CompilationJobStatus'] in ['STARTING', 'INPROGRESS']:\n",
231 | " print('.', end='')\n",
232 | " else:\n",
233 | " print(resp['CompilationJobStatus'], compilation_job_name)\n",
234 | " break\n",
235 | " time.sleep(5)\n",
236 | " \n",
237 | "if resp['CompilationJobStatus'] == 'COMPLETED':\n",
238 | " s3_compiled_model_artifact_location_fullpath = resp['ModelArtifacts']['S3ModelArtifacts']\n",
239 | " print(f'Compiled artifact location in S3: {s3_compiled_model_artifact_location_fullpath}')"
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {},
245 | "source": [
246 | "### Running the SageMaker Edge Packaging job"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": null,
252 | "metadata": {},
253 | "outputs": [],
254 | "source": [
255 | "# Run the edge packaging job\n",
256 | "edge_packaging_job_name='%s-%d' % (model_name, int(time.time()*1000))\n",
257 | "model_version=str(get_latest_approved_model_version(sm_client, model_package_group_name))\n",
258 | "\n",
259 | "# Start the edge packaging job\n",
260 | "resp = sm_client.create_edge_packaging_job(\n",
261 | " EdgePackagingJobName=edge_packaging_job_name,\n",
262 | " CompilationJobName=compilation_job_name,\n",
263 | " ModelName=model_name,\n",
264 | " ModelVersion=model_version,\n",
265 | " RoleArn=role,\n",
266 | " OutputConfig={\n",
267 | " 'S3OutputLocation': s3_edgepackaging_output_location\n",
268 | " }\n",
269 | ")\n",
270 | "\n",
271 | "# Poll the status of the job\n",
272 | "print('Started edge packaging job .', end='')\n",
273 | "while True:\n",
274 | " resp = sm_client.describe_edge_packaging_job(EdgePackagingJobName=edge_packaging_job_name)\n",
275 | " if resp['EdgePackagingJobStatus'] in ['STARTING', 'INPROGRESS']:\n",
276 | " print('.', end='')\n",
277 | " else:\n",
278 | " print(resp['EdgePackagingJobStatus'], compilation_job_name)\n",
279 | " break\n",
280 | " time.sleep(5)\n",
281 | " \n",
282 | "if resp['EdgePackagingJobStatus'] == 'COMPLETED':\n",
283 | " s3_packaged_model_artifact_location_fullpath = resp['ModelArtifact']\n",
284 | " print(f'Packaged artifact location in S3: {s3_packaged_model_artifact_location_fullpath}')"
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {},
290 | "source": [
291 | "### Running the IoT job for deployment onto the edge"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": null,
297 | "metadata": {},
298 | "outputs": [],
299 | "source": [
300 | "def split_s3_path(s3_path):\n",
301 | " path_parts=s3_path.replace(\"s3://\",\"\").split(\"/\")\n",
302 | " bucket=path_parts.pop(0)\n",
303 | " key=\"/\".join(path_parts)\n",
304 | " return bucket, key\n",
305 | "\n",
306 | "model_bucket, model_key = split_s3_path(s3_packaged_model_artifact_location_fullpath)"
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": null,
312 | "metadata": {},
313 | "outputs": [],
314 | "source": [
315 | "resp = iot_client.create_job(\n",
316 | " jobId=str(uuid.uuid4()),\n",
317 | " targets=[\n",
318 | " 'arn:aws:iot:%s:%s:thinggroup/defect-detection-%s-group' % (region, account_id, PROJECT_NAME), \n",
319 | " ],\n",
320 | " document=json.dumps({\n",
321 | " 'type': 'new_model',\n",
322 | " 'model_version': model_version,\n",
323 | " 'model_name': model_name,\n",
324 | " 'model_package_bucket': model_bucket,\n",
325 | " 'model_package_key': model_key\n",
326 | " }),\n",
327 | " targetSelection='SNAPSHOT'\n",
328 | ")"
329 | ]
330 | }
331 | ],
332 | "metadata": {
333 | "instance_type": "ml.t3.medium",
334 | "interpreter": {
335 | "hash": "dca0ade3e726a953b501b15e8e990130d2b7799f14cfd9f4271676035ebe5511"
336 | },
337 | "kernelspec": {
338 | "display_name": "Python 3.7.6 64-bit ('base': conda)",
339 | "name": "python3"
340 | },
341 | "language_info": {
342 | "name": "python",
343 | "version": ""
344 | }
345 | },
346 | "nbformat": 4,
347 | "nbformat_minor": 4
348 | }
--------------------------------------------------------------------------------
/src/cloud/image_classification_pipeline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "source": [
6 | "# Defect Detection: Image Classification - Pipeline Execution\n",
7 | "\n",
8 | "In this notebook, we will use the pipeline configured in the included python package under `pipelines` together with the defined code for preprocessing and training to automate the model training. It is easy to use: you can simply drop in whatever input data for image classification you want and have it train a model automatically.\n",
9 | "\n",
10 | "### Expected data format\n",
11 | "\n",
12 | "The expected data format for image classification is .png or .jpg images sorted into a \"normal\" or \"anomalous\" prefix in S3. Thus, the `InputData` parameter of the pipeline needs to point to an S3 prefix which contains \"folders\" (S3 prefixes) named \"normal\" and \"anomalous\". These paths will be used by the preprocessing script to create a RecordIO training data set."
13 | ],
14 | "metadata": {}
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "source": [
20 | "import boto3\n",
21 | "import sagemaker\n",
22 | "import time\n",
23 | "import uuid\n",
24 | "import json\n",
25 | "\n",
26 | "iot_client = boto3.client('iot')\n",
27 | "sts_client = boto3.client('sts')\n",
28 | "sm_client = boto3.client('sagemaker')\n",
29 | "\n",
30 | "# Get the account id\n",
31 | "account_id = sts_client.get_caller_identity()[\"Account\"]\n",
32 | "\n",
33 | "# Project Name as defined in your CloudFormation template\n",
34 | "PROJECT_NAME = ''\n",
35 | "\n",
36 | "region = boto3.Session().region_name\n",
37 | "role = sagemaker.get_execution_role()\n",
38 | "bucket_name = 'sm-edge-workshop-%s-%s' % (PROJECT_NAME, account_id)\n",
39 | "\n",
40 | "# Change these to reflect your project/business name or if you want to separate ModelPackageGroup/Pipeline from the rest of your team\n",
41 | "model_package_group_name = 'defect-detection-img-classification-%s' % PROJECT_NAME\n",
42 | "job_prefix = 'defect-detection-img-classification'\n",
43 | "pipeline_name = 'defect-detection-img-clf-pipeline-%s' % PROJECT_NAME"
44 | ],
45 | "outputs": [],
46 | "metadata": {}
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "source": [
51 | "### Getting the pipeline definition\n",
52 | "\n",
53 | "We use the `get_pipeline` method to create a pipeline DAG definition with our provided input. The input provided here is fixed for each pipeline you create or update; you cannot change these parameters with each execution (see the usage of parameters in the cell below)."
54 | ],
55 | "metadata": {}
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "source": [
61 | "from pipelines.image_classification.pipeline import get_pipeline\n",
62 | "\n",
63 | "pipeline = get_pipeline(\n",
64 | " region=region,\n",
65 | " role=role,\n",
66 | " default_bucket=bucket_name,\n",
67 | " pipeline_name=pipeline_name,\n",
68 | " base_job_prefix=job_prefix\n",
69 | ")"
70 | ],
71 | "outputs": [],
72 | "metadata": {}
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "source": [
77 | "### Creating the pipeline\n",
78 | "\n",
79 | "We create the pipeline (or update it in case it exists) with the previously defined DAG definition."
80 | ],
81 | "metadata": {}
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "source": [
87 | "pipeline.upsert(role_arn=role)"
88 | ],
89 | "outputs": [],
90 | "metadata": {}
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "source": [
95 | "### Starting the pipeline execution\n",
96 | "\n",
97 | "We now start the execution of the pipeline with a given set of parameters, which we can alter for every execution."
98 | ],
99 | "metadata": {}
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "source": [
105 | "input_data_path = 's3://%s/' % bucket_name\n",
106 | "\n",
107 | "execution = pipeline.start(\n",
108 | " parameters=dict(\n",
109 | " InputData=input_data_path,\n",
110 | " TrainingInstanceType=\"ml.p3.2xlarge\",\n",
111 | " ModelApprovalStatus=\"Approved\",\n",
112 | " ModelPackageGroupName=model_package_group_name,\n",
113 | " TargetImageSize=\"224\",\n",
114 | " AugmentCountAnomalous=\"1000\"\n",
115 | " )\n",
116 | ")"
117 | ],
118 | "outputs": [],
119 | "metadata": {}
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "source": [
124 | "### Check progress\n",
125 | "\n",
126 | "After the execution has started, you can check the progress of your pipeline execution by looking at the processing and training jobs in the SageMaker console, by using the built-in SageMaker Studio pipeline visualization, or by using SDK methods like the one below."
127 | ],
128 | "metadata": {}
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "source": [
134 | "execution.describe()"
135 | ],
136 | "outputs": [],
137 | "metadata": {}
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "source": [
142 | "## Preparing trained model for edge\n",
143 | "\n",
144 | "Please proceed here only if the execution of the training pipeline was successful. In this part of the workshop, we will prepare the model which you just trained in the pipeline for deployment onto the edge device."
145 | ],
146 | "metadata": {}
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "source": [
152 | "compilation_output_sub_folder = 'models/' + job_prefix + '/compilation-output'\n",
153 | "edgepackaging_output_sub_folder = 'models/' + job_prefix + '/edge-packaging-output'\n",
154 | "\n",
155 | "# S3 Location to save the model artifact after compilation\n",
156 | "s3_compilation_output_location = 's3://{}/{}'.format(bucket_name, compilation_output_sub_folder)\n",
157 | "\n",
158 | "# S3 Location to save the model artifact after edge packaging\n",
159 | "s3_edgepackaging_output_location = 's3://{}/{}'.format(bucket_name, edgepackaging_output_sub_folder)"
160 | ],
161 | "outputs": [],
162 | "metadata": {}
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "source": [
168 | "# Define some helper functions\n",
169 | "\n",
170 | "def get_latest_approved_s3_model_location(client, model_package_group):\n",
171 | " \"\"\"Returns the model location of the latest approved model version in a group\"\"\"\n",
172 | " response = client.list_model_packages(\n",
173 | "        ModelPackageGroupName=model_package_group,\n",
174 | " ModelApprovalStatus='Approved'\n",
175 | " )\n",
176 | " latest_version = max(response['ModelPackageSummaryList'], key=lambda x:x['ModelPackageVersion'])\n",
177 | "    model_artifact_location = client.describe_model_package(ModelPackageName=latest_version['ModelPackageArn'])['InferenceSpecification']['Containers'][0]['ModelDataUrl']\n",
178 | " return model_artifact_location\n",
179 | "\n",
180 | "def get_latest_approved_model_version(client, model_package_group):\n",
181 | " \"\"\"Returns the model version of the latest approved model version in a group\"\"\"\n",
182 | " response = client.list_model_packages(\n",
183 | "        ModelPackageGroupName=model_package_group,\n",
184 | " ModelApprovalStatus='Approved'\n",
185 | " )\n",
186 | " latest_version = max(response['ModelPackageSummaryList'], key=lambda x:x['ModelPackageVersion'])\n",
187 | " return latest_version['ModelPackageVersion']"
188 | ],
189 | "outputs": [],
190 | "metadata": {}
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "source": [
195 | "### Run SageMaker Neo compilation job"
196 | ],
197 | "metadata": {}
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "source": [
203 | "# Retrieve some information on the model we just trained and registered in SageMaker Model Registry\n",
204 | "s3_model_artifact_location = get_latest_approved_s3_model_location(sm_client, model_package_group_name)\n",
205 | "print(s3_model_artifact_location)\n",
206 | "\n",
207 | "model_name = 'img-classification'\n",
208 | "compilation_job_name = '%s-%d' % (model_name, int(time.time()*1000))\n",
209 | "\n",
210 | "# Lets start a compilation job for the target architecture\n",
211 | "sm_client.create_compilation_job(\n",
212 | " CompilationJobName=compilation_job_name,\n",
213 | " RoleArn=role,\n",
214 | " InputConfig={\n",
215 | " 'S3Uri': s3_model_artifact_location,\n",
216 | " 'DataInputConfig': '{\"data\": [1,3,224,224]}',\n",
217 | " 'Framework': 'MXNET'\n",
218 | " },\n",
219 | " OutputConfig={\n",
220 | " 'S3OutputLocation': s3_compilation_output_location,\n",
221 | " 'TargetPlatform': {'Os': 'LINUX', 'Arch': 'X86_64'}\n",
222 | " },\n",
223 | " StoppingCondition={ 'MaxRuntimeInSeconds': 900 }\n",
224 | ")\n",
225 | "\n",
226 | "# Poll the status of the job\n",
227 | "print('Started compilation job .', end='')\n",
228 | "while True:\n",
229 | " resp = sm_client.describe_compilation_job(CompilationJobName=compilation_job_name)\n",
230 | " if resp['CompilationJobStatus'] in ['STARTING', 'INPROGRESS']:\n",
231 | " print('.', end='')\n",
232 | " else:\n",
233 | " print(resp['CompilationJobStatus'], compilation_job_name)\n",
234 | " break\n",
235 | " time.sleep(5)\n",
236 | " \n",
237 | "if resp['CompilationJobStatus'] == 'COMPLETED':\n",
238 | " s3_compiled_model_artifact_location_fullpath = resp['ModelArtifacts']['S3ModelArtifacts']\n",
239 | " print(f'Compiled artifact location in S3: {s3_compiled_model_artifact_location_fullpath}')"
240 | ],
241 | "outputs": [],
242 | "metadata": {}
243 | },
244 | {
245 | "cell_type": "markdown",
246 | "source": [
247 | "### Running the SageMaker Edge Packaging job"
248 | ],
249 | "metadata": {}
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": null,
254 | "source": [
255 | "# Run the edge packaging job\n",
256 | "edge_packaging_job_name='%s-%d' % (model_name, int(time.time()*1000))\n",
257 | "model_version=str(get_latest_approved_model_version(sm_client, model_package_group_name))\n",
258 | "\n",
259 | "# Start the edge packaging job\n",
260 | "resp = sm_client.create_edge_packaging_job(\n",
261 | " EdgePackagingJobName=edge_packaging_job_name,\n",
262 | " CompilationJobName=compilation_job_name,\n",
263 | " ModelName=model_name,\n",
264 | " ModelVersion=model_version,\n",
265 | " RoleArn=role,\n",
266 | " OutputConfig={\n",
267 | " 'S3OutputLocation': s3_edgepackaging_output_location\n",
268 | " }\n",
269 | ")\n",
270 | "\n",
271 | "# Poll the status of the job\n",
272 | "print('Started edge packaging job .', end='')\n",
273 | "while True:\n",
274 | " resp = sm_client.describe_edge_packaging_job(EdgePackagingJobName=edge_packaging_job_name)\n",
275 | " if resp['EdgePackagingJobStatus'] in ['STARTING', 'INPROGRESS']:\n",
276 | " print('.', end='')\n",
277 | " else:\n",
278 | " print(resp['EdgePackagingJobStatus'], compilation_job_name)\n",
279 | " break\n",
280 | " time.sleep(5)\n",
281 | " \n",
282 | "if resp['EdgePackagingJobStatus'] == 'COMPLETED':\n",
283 | " s3_packaged_model_artifact_location_fullpath = resp['ModelArtifact']\n",
284 | " print(f'Packaged artifact location in S3: {s3_packaged_model_artifact_location_fullpath}')"
285 | ],
286 | "outputs": [],
287 | "metadata": {}
288 | },
289 | {
290 | "cell_type": "markdown",
291 | "source": [
292 | "### Running the IoT job for deployment onto the edge"
293 | ],
294 | "metadata": {}
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": null,
299 | "source": [
300 | "def split_s3_path(s3_path):\n",
301 | " path_parts=s3_path.replace(\"s3://\",\"\").split(\"/\")\n",
302 | " bucket=path_parts.pop(0)\n",
303 | " key=\"/\".join(path_parts)\n",
304 | " return bucket, key\n",
305 | "\n",
306 | "model_bucket, model_key = split_s3_path(s3_packaged_model_artifact_location_fullpath)"
307 | ],
308 | "outputs": [],
309 | "metadata": {}
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": null,
314 | "source": [
315 | "resp = iot_client.create_job(\n",
316 | " jobId=str(uuid.uuid4()),\n",
317 | " targets=[\n",
318 | " 'arn:aws:iot:%s:%s:thinggroup/defect-detection-%s-group' % (region, account_id, PROJECT_NAME), \n",
319 | " ],\n",
320 | " document=json.dumps({\n",
321 | " 'type': 'new_model',\n",
322 | " 'model_version': model_version,\n",
323 | " 'model_name': model_name,\n",
324 | " 'model_package_bucket': model_bucket,\n",
325 | " 'model_package_key': model_key\n",
326 | " }),\n",
327 | " targetSelection='SNAPSHOT'\n",
328 | ")"
329 | ],
330 | "outputs": [],
331 | "metadata": {}
332 | }
333 | ],
334 | "metadata": {
335 | "instance_type": "ml.t3.medium",
336 | "interpreter": {
337 | "hash": "dca0ade3e726a953b501b15e8e990130d2b7799f14cfd9f4271676035ebe5511"
338 | },
339 | "kernelspec": {
340 | "display_name": "Python 3.7.6 64-bit ('base': conda)",
341 | "name": "python3"
342 | },
343 | "language_info": {
344 | "name": "python",
345 | "version": ""
346 | }
347 | },
348 | "nbformat": 4,
349 | "nbformat_minor": 4
350 | }
--------------------------------------------------------------------------------
/setup/lambda-custom-resource/prepare_dev_package_cr.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 | """
4 | Lambda-backed custom resource function to create the SageMaker Edge Manager device package.
5 | Support SageMaker Edge Agent Version:
6 | """
7 | import json
8 | import os
9 | import logging
10 | import stat
11 |
12 | import urllib3
13 | import boto3
14 | import tarfile
15 | import io
16 | from botocore.exceptions import ClientError
17 |
18 | http = urllib3.PoolManager()
19 |
20 | LOGGER = logging.getLogger()
21 | LOGGER.setLevel(logging.INFO)
22 |
23 | BUCKET_NAME = os.environ['BUCKET_NAME']
24 | PROJECT_NAME = os.environ['PROJECT_NAME']
25 | AWS_REGION = os.environ['AWS_REGION']
26 |
27 | LOCAL_DIR_PREFIX = '/tmp/' # Needed for running in AWS Lambda
28 |
29 | iot_client = boto3.client('iot')
30 | sm_client = boto3.client('sagemaker')
31 | s3_client = boto3.client('s3')
32 |
33 | # Global variables
34 | # This information needs to match with what was defined in the CloudFormation template
35 | sm_edge_device_name = 'edge-device-defect-detection-%s' % PROJECT_NAME
36 | iot_policy_name = 'defect-detection-policy-%s' % PROJECT_NAME
37 | iot_thing_name = 'edge-device-%s' % PROJECT_NAME
38 | iot_thing_group_name='defect-detection-%s-group' % PROJECT_NAME
39 | sm_em_fleet_name = 'defect-detection-%s' % PROJECT_NAME
40 | role_alias = 'SageMakerEdge-%s' % sm_em_fleet_name
41 |
42 |
43 | def cfn_cleanup():
44 | """Clean up resources created in the custom resources"""
45 |
46 | LOGGER.info('Deleting role alias if exists')
47 | try:
48 | iot_client.delete_role_alias(roleAlias=role_alias)
49 | except:
50 |         LOGGER.info('Role alias deletion failed, continuing anyway')
51 |
52 | LOGGER.info('Deregistering device from edge fleet if exists')
53 | try:
54 | sm_client.deregister_devices(
55 | DeviceFleetName=sm_em_fleet_name,
56 | DeviceNames=[sm_edge_device_name]
57 | )
58 | except:
59 |         LOGGER.info('Device deregistration failed, continuing anyway')
60 |
61 | LOGGER.info('Detaching certificates')
62 | try:
63 | cert_arn = iot_client.list_thing_principals(thingName=iot_thing_name)['principals'][0]
64 | cert_id = cert_arn.split('/')[-1]
65 | iot_client.detach_policy(policyName=iot_policy_name, target=cert_arn)
66 | iot_client.detach_thing_principal(thingName=iot_thing_name, principal=cert_arn)
67 | iot_client.update_certificate(certificateId=cert_id, newStatus='INACTIVE')
68 | iot_client.delete_certificate(certificateId=cert_id, forceDelete=True)
69 | iot_client.delete_thing_group(thingGroupName=iot_thing_group_name)
70 | except:
71 |         LOGGER.info('Detaching certificates failed, continuing anyway')
72 |
73 |
74 |
75 |
76 | def lambda_handler(event, context):
77 | '''Handle Lambda event from AWS'''
78 |
79 | try:
80 | LOGGER.info('REQUEST RECEIVED:\n %s', event)
81 | LOGGER.info('REQUEST RECEIVED:\n %s', context)
82 | if event['RequestType'] == 'Create':
83 | LOGGER.info('CREATE!')
84 |
85 | LOGGER.info('Starting device packaging...')
86 | try:
87 | prepare_device_package(event, context)
88 | send_response(event, context, "SUCCESS",
89 | {"Message": "Resource creation successful!"})
90 | except Exception as e:
91 | send_response(event, context, "FAILED", {"Message": "Resource creation failed during device packaging!", "Error": str(e)})
92 | elif event['RequestType'] == 'Update':
93 | LOGGER.info('UPDATE!')
94 | send_response(event, context, "SUCCESS",
95 | {"Message": "Resource update successful!"})
96 | elif event['RequestType'] == 'Delete':
97 | LOGGER.info('DELETE!')
98 | # Start cleanup method
99 | cfn_cleanup()
100 | send_response(event, context, "SUCCESS",
101 | {"Message": "Resource deletion successful!"})
102 | else:
103 | LOGGER.info('FAILED!')
104 | send_response(event, context, "FAILED",
105 | {"Message": "Unexpected event received from CloudFormation"})
106 | except: #pylint: disable=W0702
107 | LOGGER.info('FAILED!')
108 | send_response(event, context, "FAILED", {
109 | "Message": "Exception during processing"})
110 |
111 |
112 | def send_response(event, context, response_status, response_data):
113 | '''Send a resource manipulation status response to CloudFormation'''
114 | response_body = json.dumps({
115 | "Status": response_status,
116 | "Reason": "See the details in CloudWatch Log Stream: " + context.log_stream_name,
117 | "PhysicalResourceId": context.log_stream_name,
118 | "StackId": event['StackId'],
119 | "RequestId": event['RequestId'],
120 | "LogicalResourceId": event['LogicalResourceId'],
121 | "Data": response_data
122 | })
123 |
124 | print("Response body:")
125 | print(response_body)
126 |
127 | response_url = event['ResponseURL']
128 |
129 | headers = {
130 | 'content-type' : '',
131 | 'content-length' : str(len(response_body))
132 | }
133 |
134 | try:
135 | response = http.request('PUT', response_url, headers=headers, body=response_body)
136 | print("Status code:", response.status)
137 |
138 | except Exception as e:
139 |
140 | print("send(..) failed executing http.request(..):", e)
141 |
142 |
143 | def setup_agent(thing_group_name, thing_group_arn):
144 |     """Creates the agent configuration file and prepares the SageMaker Edge Agent
145 |     device package for upload to an Amazon S3 bucket. Registers the device with the
146 |     device fleet, creates IoT certificates and attaches them to the previously created
147 |     IoT thing. Saves the certificates to local disk so they are ready for upload to S3.
148 |
149 | Args:
150 | thing_group_name (string): a name for the IoT thing group
151 | thing_group_arn (string): the ARN of the IoT thing group
152 | """
153 |
154 | local_base_path = LOCAL_DIR_PREFIX + "agent/certificates/iot/edge_device_cert_%s.pem"
155 | relative_base_path = "agent/certificates/iot/edge_device_cert_%s.pem"
156 | thing_arn_template = thing_group_arn.replace('thinggroup', 'thing').replace(thing_group_name, '%s')
157 | cred_host = iot_client.describe_endpoint(endpointType='iot:CredentialProvider')['endpointAddress']
158 |
159 | # Check length of device name string
160 | if len(sm_edge_device_name) > 64:
161 |         LOGGER.error("Device name for edge device is too long. Needs to be at most 64 characters.")
162 |         raise ValueError('Device name for edge device is longer than 64 characters. Please choose a shorter value for ProjectName.')
163 |
164 | # register the device in the fleet
165 |     # the device name was validated above to be at most 64 characters
166 | dev = [{'DeviceName': sm_edge_device_name, 'IotThingName': iot_thing_name}]
167 | try:
168 | sm_client.describe_device(DeviceFleetName=sm_em_fleet_name, DeviceName=sm_edge_device_name)
169 | LOGGER.info("Device was already registered on SageMaker Edge Manager")
170 | except ClientError as e:
171 | if e.response['Error']['Code'] != 'ValidationException': raise e
172 | LOGGER.info("Registering a new device %s on fleet %s" % (sm_edge_device_name, sm_em_fleet_name))
173 | sm_client.register_devices(DeviceFleetName=sm_em_fleet_name, Devices=dev)
174 | iot_client.add_thing_to_thing_group(
175 | thingGroupName=thing_group_name,
176 | thingGroupArn=thing_group_arn,
177 | thingName=iot_thing_name,
178 | thingArn=thing_arn_template % iot_thing_name
179 | )
180 |
181 | # if you reach this point you need to create new certificates
182 | # generate the certificates
183 | cert = local_base_path % ('cert')
184 |     key = local_base_path % ('key')
185 |     pub = local_base_path % ('pub')
186 |
187 | # Relative paths needed for setting path in config file
188 | cert_relative = relative_base_path % ('cert')
189 |     key_relative = relative_base_path % ('key')
190 |     pub_relative = relative_base_path % ('pub')
191 |
192 | cert_meta=iot_client.create_keys_and_certificate(setAsActive=True)
193 | cert_arn = cert_meta['certificateArn']
194 | with open(cert, 'w') as c: c.write(cert_meta['certificatePem'])
195 | with open(key, 'w') as c: c.write(cert_meta['keyPair']['PrivateKey'])
196 | with open(pub, 'w') as c: c.write(cert_meta['keyPair']['PublicKey'])
197 |
198 | # attach the certificates to the policy and to the thing
199 | iot_client.attach_policy(policyName=iot_policy_name, target=cert_arn)
200 | iot_client.attach_thing_principal(thingName=iot_thing_name, principal=cert_arn)
201 |
202 | LOGGER.info("Creating agent config JSON file")
203 |
204 | # Please note that the $WORKDIR variables need to be replaced by the absolute path of the working directory of your project.
205 | # If you follow the guide, the install script will automatically replace those.
206 | agent_params = {
207 | "sagemaker_edge_core_device_name": sm_edge_device_name,
208 | "sagemaker_edge_core_device_fleet_name": sm_em_fleet_name,
209 | "sagemaker_edge_core_region": AWS_REGION,
210 | "sagemaker_edge_provider_provider": "Aws",
211 | "sagemaker_edge_provider_provider_path" : "$WORKDIR/agent/lib/libprovider_aws.so",
212 | "sagemaker_edge_core_root_certs_path": "$WORKDIR/agent/certificates/root",
213 | "sagemaker_edge_provider_aws_ca_cert_file": "$WORKDIR/agent/certificates/iot/AmazonRootCA1.pem",
214 | "sagemaker_edge_provider_aws_cert_file": "$WORKDIR/%s" % cert_relative,
215 | "sagemaker_edge_provider_aws_cert_pk_file": "$WORKDIR/%s" % key_relative,
216 | "sagemaker_edge_provider_aws_iot_cred_endpoint": "https://%s/role-aliases/%s/credentials" % (cred_host,role_alias),
217 | "sagemaker_edge_core_capture_data_destination": "Cloud",
218 | "sagemaker_edge_provider_s3_bucket_name": BUCKET_NAME,
219 | "sagemaker_edge_core_folder_prefix": "edge-agent-inference-data-capture",
220 | "sagemaker_edge_core_capture_data_buffer_size": 30,
221 | "sagemaker_edge_core_capture_data_batch_size": 10,
222 | "sagemaker_edge_core_capture_data_push_period_seconds": 10,
223 | "sagemaker_edge_core_capture_data_base64_embed_limit": 2,
224 | "sagemaker_edge_log_verbose": False
225 | }
226 | with open(LOCAL_DIR_PREFIX + 'agent/conf/config_edge_device.json', 'w') as conf:
227 | conf.write(json.dumps(agent_params, indent=4))
228 |
229 |
230 | def prepare_device_package(event, context):
231 | """Prepares the edge device package in a lambda function and uploads it to the S3 bucket"""
232 |
233 | # create a new thing group
234 | thing_group_arn = None
235 | agent_pkg_bucket = 'sagemaker-edge-release-store-us-west-2-linux-x64'
236 | agent_config_package_prefix = 'edge-device-configuration/agent/config.tgz'
237 |
238 | # check if edge agent package has already been built
239 | try:
240 | s3_client.download_file(Bucket=BUCKET_NAME, Key=agent_config_package_prefix, Filename='/tmp/dump')
241 | LOGGER.info('The agent configuration package was already built! Skipping...')
242 |         return
243 | except ClientError as e:
244 | pass
245 |
246 | # Create a new thing group if not found yet
247 | try:
248 | thing_group_arn = iot_client.describe_thing_group(thingGroupName=iot_thing_group_name)['thingGroupArn']
249 | LOGGER.info("Thing group found")
250 | except iot_client.exceptions.ResourceNotFoundException as e:
251 | LOGGER.info("Creating a new thing group")
252 | thing_group_arn = iot_client.create_thing_group(thingGroupName=iot_thing_group_name)['thingGroupArn']
253 |
254 | LOGGER.info("Creating the directory structure for the agent")
255 | # create a structure for the agent files
256 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/certificates/root', exist_ok=True)
257 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/certificates/iot', exist_ok=True)
258 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/logs', exist_ok=True)
259 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/model', exist_ok=True)
260 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/conf', exist_ok=True)
261 |
262 | LOGGER.info("Downloading root certificate and agent binary")
263 | # then get some root certificates
264 | resp = http.request('GET', 'https://www.amazontrust.com/repository/AmazonRootCA1.pem')
265 | with open(LOCAL_DIR_PREFIX + 'agent/certificates/iot/AmazonRootCA1.pem', 'w') as c:
266 | c.write(resp.data.decode('utf-8'))
267 |
268 |     # this certificate validates the edge manager package
269 | s3_client.download_file(
270 | Bucket=agent_pkg_bucket,
271 | Key='Certificates/%s/%s.pem' % (AWS_REGION, AWS_REGION),
272 | Filename=LOCAL_DIR_PREFIX + 'agent/certificates/root/%s.pem' % AWS_REGION
273 | )
274 |
275 | LOGGER.info("Adjusting file permissions of pem files")
276 | # adjust the permissions of the files
277 | os.chmod(LOCAL_DIR_PREFIX + 'agent/certificates/iot/AmazonRootCA1.pem', stat.S_IRUSR|stat.S_IRGRP)
278 | os.chmod(LOCAL_DIR_PREFIX + 'agent/certificates/root/%s.pem' % AWS_REGION, stat.S_IRUSR|stat.S_IRGRP)
279 |
280 | LOGGER.info("Processing the agent...")
281 |     setup_agent(iot_thing_group_name, thing_group_arn)
282 |
283 | LOGGER.info("Creating the final package...")
284 | with io.BytesIO() as f:
285 | with tarfile.open(fileobj=f, mode='w:gz') as tar:
286 | tar.add(LOCAL_DIR_PREFIX + 'agent', 'agent', recursive=True)
287 | f.seek(0)
288 | LOGGER.info("Uploading to S3")
289 | s3_client.upload_fileobj(f, Bucket=BUCKET_NAME, Key=agent_config_package_prefix)
290 | LOGGER.info("Done!")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Defect detection using computer vision at the edge with Amazon SageMaker
2 |
3 | This repository is related to our blog post [Detect industrial defects at low latency with computer vision at the edge with Amazon SageMaker Edge](https://aws.amazon.com/blogs/machine-learning/detect-industrial-defects-at-low-latency-with-computer-vision-at-the-edge-with-amazon-sagemaker-edge/) in the AWS Machine Learning blog.
4 |
5 | In this workshop, we will walk you through a step-by-step process to build and train computer vision models with Amazon SageMaker and to package and deploy them to the edge with [SageMaker Edge Manager](https://aws.amazon.com/sagemaker/edge-manager/). The workshop focuses on a defect detection use case in an industrial setting, using models like image classification and semantic segmentation to detect defects across several object types. We will complete the MLOps lifecycle with continuous, versioned over-the-air model updates and data capture to the cloud.
6 |
7 | > [!WARNING]
8 | > Please note that this sample is outdated. Since 26th of April 2024, SageMaker Edge Manager has been discontinued. Please refer to the respective [EOL documentation page](https://docs.aws.amazon.com/sagemaker/latest/dg/edge-eol.html) in order to learn about potential alternatives.
9 |
10 | ## Architecture
11 |
12 | The architecture we will build during this workshop is illustrated below. Several key components can be highlighted:
13 |
14 | 1. **Model development and training on the cloud**: This repository contains code for two pipelines based on [SageMaker Pipelines](https://docs.aws.amazon.com/sagemaker/latest/dg/pipelines.html), one for each of the two model types used (classification and segmentation). These pipelines are built and executed from a SageMaker Studio notebook.
15 | 2. **Model deployment to the edge**: Once a model building pipeline has executed successfully, models are compiled with [SageMaker Neo](https://aws.amazon.com/sagemaker/neo/) and packaged with a [SageMaker Edge packaging job](https://docs.aws.amazon.com/sagemaker/latest/dg/edge-packaging-job.html). They can then be deployed onto the edge device via IoT jobs. On the edge device, a running application receives the model deployment job payload via MQTT and downloads the relevant model package.
16 | 3. **Edge inference**: The edge device runs the actual application for defect detection. In this workshop, we will use an EC2 instance to simulate an edge device, but any hardware device (Raspberry Pi, NVIDIA Jetson) can be used as long as it is a supported SageMaker Neo compilation target. During setup, a configuration package is downloaded to the edge device to configure SageMaker Edge Agent. The Edge Agent on the device can then load models deployed via OTA updates and make them available for prediction via a low-latency gRPC API (see the [SageMaker Edge Manager documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/edge.html)).
17 |
18 | 
19 |
20 | ## Dataset
21 |
22 | This workshop is designed to be used with any dataset for defect detection that includes labels and masks. To be able to use both models (see section [Models](#models)), you will need a dataset of labelled images (*normal* and *anomalous*) as well as a set of respective *ground truth masks* which identify where the defect on a part is located. To train the models with the provided pipeline without any major code adjustments, you merely need to upload the dataset in the expected format, using the correct path prefixes, to an S3 bucket. Please refer to the [Walkthrough](#walkthrough) below for more details on model training with a dataset.
23 |
24 | However, for simplicity of this walkthrough, we will showcase the end-to-end solution using the [KolektorSDD2](https://www.vicos.si/resources/kolektorsdd2/) dataset for defect detection. This dataset consists of over 3000 images of surface defects together with respective binary masks which identify the location of those defects in the image. This makes the dataset well suited for our use case.
25 |
26 | Below you can find examples of those images and their masks as provided in the dataset. The image was taken from the [website](https://www.vicos.si/resources/kolektorsdd2/) of the creators of the KolektorSDD2 dataset (see also [Bozic et al., 2021] under [References](#references)).
27 |
28 | 
29 |
30 | ## Models
31 |
32 | In this workshop, you will build two types of machine learning models:
33 |
34 | * an image classification model using the [built-in SageMaker Image Classification algorithm](https://docs.aws.amazon.com/sagemaker/latest/dg/image-classification.html) based on the [MXNet framework](https://mxnet.apache.org/versions/1.8.0/)
35 | * a semantic segmentation model built with [Tensorflow/Keras](https://github.com/tensorflow/tensorflow) using the [UNET deep learning architecture](https://arxiv.org/abs/1505.04597)
36 |
37 | ## Directory structure of this repository
38 |
39 | This repository has the following directory structure:
40 |
41 | ```
42 | ├── setup <-- contains the CloudFormation template for easy-to-use setup of AWS resources
43 | └── src <-- contains the actual source code for this project
44 | ├── cloud <-- contains the code for model training in the cloud and initiation of OTA deployments to the edge
45 | └── edge <-- contains the code that is running on the edge device
46 | ```
47 |
48 | ### Edge code directory structure
49 |
50 | ```
51 | src/edge
52 | ├── app <-- python module for this application
53 | │ ├── edgeagentclient.py <-- abstractions for calling edge agent gRPC APIs
54 | │ ├── logger.py <-- utilities for logging output to AWS IoT Core
55 | │ ├── ota.py <-- utilities for handling OTA IoT jobs
56 | │ └── util.py <-- additional utilities
57 | ├── install.py <-- install script for downloading and configuring edge agent
58 | ├── models_config.json <-- model configuration, also used for persisting model versions
59 | ├── run.py <-- runs the edge application
60 | ├── start_edge_agent.sh <-- starts the SM edge agent
61 | ├── static <-- contains static images for Flask app, download test images here
62 | └── templates <-- contains HTML Jinja templates for Flask app
63 | ```
64 |
65 | ### Cloud code directory structure
66 |
67 | ```
68 | src/cloud
69 | ├── image_classification_pipeline.ipynb <-- notebook for running the image classification pipeline
70 | ├── semantic_segmentation_pipeline.ipynb <-- notebook for running the semantic segmentation pipeline
71 | ├── data_preparation.ipynb <-- notebook for data preprocessing of the KolektorSDD2 dataset
72 | └── pipelines <-- model building code and pipeline definition
73 | ├── get_pipeline_definition.py <-- CLI tool for CICD
74 | ├── run_pipeline.py <-- CLI tool for CICD
75 | ├── image_classification <-- contains the pipeline code for image classification
76 | │ ├── evaluation.py <-- script to evaluate model performance on test dataset
77 | │ ├── pipeline.py <-- pipeline definition
78 | │ └── preprocessing.py <-- script for preprocessing (augmentation, train/test/val split)
79 | └── semantic_segmentation <-- contains the pipeline code for semantic segmentation
80 | ├── pipeline.py <-- pipeline definition
81 | ├── preprocessing.py <-- script for preprocessing (augmentation, train/test/val split)
82 | ├── requirements.txt <-- python dependencies needed for training
83 | └── train_tf.py <-- training script for training the unet model
84 |
85 | ```
86 |
87 | ## Walkthrough
88 |
89 | Please follow the steps below to start building your own edge ML project. You will create a CloudFormation stack to set up all necessary resources in the cloud and prepare an edge device for use with SageMaker Edge Manager. You will then train models in the cloud and deploy them to the edge device using AWS IoT. Please note that model training in the cloud and running inference on the edge depend on each other. We recommend that you start by setting up the edge device first and then train the models as a second step. This way, you can deploy the models to the edge directly after you have successfully trained them.
90 |
91 | ### Setting up workshop resources by launching the CloudFormation stack
92 |
93 | 1. Launch a new CloudFormation stack with the provided template under `setup/template.yaml`. To learn how to deploy CloudFormation stacks, please refer to the [documentation](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/cfn-console-create-stack.html).
94 | 2. Define a name for the stack and enter a *Project Name* parameter that is unique in your account. It must be compliant with Amazon S3 bucket naming rules, so please choose a lowercase string. The project name you define during stack creation determines the names of many of the resources created with the stack. Make sure to take note of this parameter.
95 | 3. Have a look at the CloudFormation stack outputs and take note of the provided information.
96 |
97 | #### What is being created by the CloudFormation stack?
98 |
99 | This stack configures several resources needed for this workshop. It sets up an IoT thing together with certificates and roles, creates an Edge Manager device fleet, registers the device with the fleet, and creates an edge agent configuration package, which is saved in the S3 bucket for this project. The following image illustrates the resources being created with the CloudFormation stack.
100 |
101 | 
102 |
103 | ### Configuring the edge device
104 |
105 | 1. Launch an EC2 instance with Ubuntu Server 20 and SSH access (e.g. via [Session Manager](https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager.html)) into a public subnet and make sure it gets assigned a public IP address (you will need it later to access the web application). Ensure that the instance has access to the S3 bucket containing your configuration package (find the bucket name in the CloudFormation output) as well as to the bucket containing the SageMaker Edge Agent binary. For more information, refer to the [SageMaker Edge Manager documentation pages](https://docs.aws.amazon.com/sagemaker/latest/dg/edge-device-fleet-about.html). This EC2 instance will from now on be considered our "edge device".
106 | 2. Clone this GitHub repository onto the edge device or simply copy the `src/edge` directory onto the edge device.
107 | 3. Install the system dependencies by running `sudo apt update -y && sudo apt install -y build-essential procps`, then install the necessary Python dependencies with `pip install -r requirements.txt`.
108 | 4. Run the installation script with `python3 install.py --project-name <project-name> --account-id <account-id>`. This script will download the edge agent configuration package created during the CloudFormation deployment, download the edge agent binary, and generate the protobuf agent stubs. A newly created directory `./agent/` contains the files for the edge agent. The following image illustrates what happens in the installation script:
109 |
110 | 
111 |
112 | 5. Create an environment variable to define the location of the agent directory. If you haven't changed your current directory, this would likely be `export SM_EDGE_AGENT_HOME=$PWD/agent`.
113 | 6. Start the edge agent by running `./start_edge_agent.sh`, which launches the edge agent on the unix socket `tmp/edge_agent`. You should now be able to interact with the edge agent from your application.
114 | 7. Before running the actual application, you need to define an environment variable which determines whether you want to run the app with the Flask development server or with a production-ready WSGI server (using [waitress](https://github.com/Pylons/waitress)). For now, let's use the production server by setting `export SM_APP_ENV=prod`. For debugging, you might later want to change this to `dev`.
115 | 8. Run the application with `python3 run.py` to initialize the application, verify cloud connectivity, and connect to the edge agent. This application is a [Flask](https://flask.palletsprojects.com/en/2.0.x/) web application running on port 8080 which is integrated with SageMaker Edge Agent and AWS IoT for OTA updates. If you have no models deployed yet and have not downloaded any test images, nothing will happen in the application: it will stay idle until it can access test images in the `/static` folder and run inference on them with a deployed model. In the next step, we will see how we can run automated model training with SageMaker Pipelines and deploy the resulting models onto the edge device for local inference.
116 | 9. Go to the EC2 dashboard and find the public IP address of your instance. Browse to the public IP address on port 8080, i.e. `http://<public-ip>:8080`. You should now see the web application in your browser window. Ensure that you allow ingress on port 8080 in the security group attached to your instance (see [here](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/working-with-security-groups.html#adding-security-group-rule) for details on how to set this up; a boto3 sketch follows this list). Also, make sure any local firewall on your device allows traffic on port 8080. Refer to the [Troubleshooting](#troubleshooting-and-faq) section for further tips.
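
If you prefer to add the security group rule programmatically rather than in the console, a minimal boto3 sketch could look as follows; `<sg-id>` and `<your-ip>` are placeholders for your security group ID and your own IP address.

```python
# Hedged sketch: allow inbound traffic to the web application on port 8080.
# <sg-id> and <your-ip> are placeholders; restrict the CIDR to your own IP.
import boto3

ec2 = boto3.client("ec2")

ec2.authorize_security_group_ingress(
    GroupId="<sg-id>",
    IpPermissions=[{
        "IpProtocol": "tcp",
        "FromPort": 8080,
        "ToPort": 8080,
        "IpRanges": [{"CidrIp": "<your-ip>/32", "Description": "Defect detection web app"}],
    }],
)
```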
117 |
118 | ### Automated model training in the cloud with SageMaker Pipelines
119 |
120 | 1. Create a SageMaker Studio domain and user by following [this](https://docs.aws.amazon.com/sagemaker/latest/dg/notebooks.html) guide in the documentation. Make sure that the IAM role used has access to the S3 bucket created during the CloudFormation deployment.
121 | 2. Clone this repository or copy the `src/cloud` directory onto the SageMaker Studio domain.
122 | 3. Prepare your dataset for training. In case you choose to use the KolektorSDD2 dataset like in this example, you can use the provided notebook under `src/cloud/data_preparation.ipynb` to download the dataset and partition it into the subdirectories needed for the training pipeline. With the provided pipeline code you can train two model types (image classification and semantic segmentation). You might want to set aside some images to be used for local inference; download those onto the edge device and save them into the `static` folder so they can be used for inference by the edge application. Please note that we integrated a step in the preprocessing scripts to add padding around the images in order to make them square, as the KolektorSDD2 dataset contains non-square images (a padding example is sketched below this list). If your provided images are already square, this step will be skipped. Just ensure that you use the same preprocessing for inference later (i.e. add padding if padding was added for training).
123 | 4. To use the pipelines without any code modifications, you need to structure your datasets as follows (see the upload sketch after this list):
124 |     * **Image Classification**: Your dataset needs to be split into `normal` and `anomalous` directories according to the respective label. Upload the data to your S3 bucket (e.g. under `s3://<bucket-name>/data/img-classification/`). Thus, your normal images will be located in `s3://<bucket-name>/data/img-classification/normal` and the anomalous ones in `s3://<bucket-name>/data/img-classification/anomalous`. The train/test/validation split will be done automatically in the preprocessing step of the pipeline.
125 |     * **Semantic Segmentation**: Your dataset needs to be split into `images` and `masks` directories. Upload the data to your S3 bucket (e.g. under `s3://<bucket-name>/data/semantic-segmentation/`). Thus, your images will be located in `s3://<bucket-name>/data/semantic-segmentation/images` and the binary masks in `s3://<bucket-name>/data/semantic-segmentation/masks`. The train/test/validation split will be done automatically in the preprocessing step of the pipeline.
126 | 5. Execute the training pipeline: you will find a Jupyter notebook for each of the model types in `src/cloud/`. In the notebook, set the project name you used during the CloudFormation deployment. Also, you need to provide the S3 input data path as a parameter of the pipeline; please make sure it aligns with the S3 path you used for uploading the dataset in the previous step. You can monitor the pipeline execution in your SageMaker Studio domain. When it finishes successfully, it should look similar to the one displayed below.
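
To make the expected layout from step 4 concrete, here is a minimal upload sketch, assuming you prepared the images locally; the bucket name and the local directory names are placeholders, and any other upload method (console, CLI) works equally well.

```python
# Hedged sketch: upload a locally prepared dataset into the prefixes the
# pipelines expect. <bucket-name> and the local directories are placeholders.
from pathlib import Path

import boto3

s3 = boto3.client("s3")
bucket = "<bucket-name>"

prefix_map = {
    "data/img-classification/normal": Path("local-data/img-classification/normal"),
    "data/img-classification/anomalous": Path("local-data/img-classification/anomalous"),
    "data/semantic-segmentation/images": Path("local-data/semantic-segmentation/images"),
    "data/semantic-segmentation/masks": Path("local-data/semantic-segmentation/masks"),
}

for prefix, local_dir in prefix_map.items():
    for image_path in local_dir.glob("*"):
        # upload the image formats mentioned above (.png / .jpg)
        if image_path.suffix.lower() in (".png", ".jpg", ".jpeg"):
            s3.upload_file(str(image_path), bucket, f"{prefix}/{image_path.name}")
```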
127 |
128 | 
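
The square padding mentioned in step 3 is implemented in the `preprocessing.py` scripts; conceptually it works roughly like the OpenCV sketch below. This is an illustrative example rather than the repository's exact code, and the input file name is a placeholder.

```python
# Hedged sketch: pad an image to a square before resizing, mirroring the idea
# of the preprocessing step described above (not the exact pipeline code).
import cv2
import numpy as np

def pad_to_square(image: np.ndarray, value: int = 0) -> np.ndarray:
    """Pad the shorter side symmetrically so the image becomes square."""
    height, width = image.shape[:2]
    size = max(height, width)
    top = (size - height) // 2
    bottom = size - height - top
    left = (size - width) // 2
    right = size - width - left
    return cv2.copyMakeBorder(image, top, bottom, left, right,
                              borderType=cv2.BORDER_CONSTANT, value=value)

image = cv2.imread("static/example.png")  # placeholder test image
square = cv2.resize(pad_to_square(image), (224, 224))
```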
129 |
130 | ### Edge deployment and inference at edge
131 |
132 | 1. Once the pipeline has finished successfully, your model is almost ready for use on the edge device. Verify that the latest model version in the model registry is approved to make it available for edge deployment (an example approval snippet follows this list).
133 | 2. Execute the following cells of the notebook to run model compilation with SageMaker Neo and then package the model for usage with SageMaker Edge Manager.
134 | 3. Finally, you can deploy the model package onto the edge by running an IoT job as an over-the-air (OTA) update (an example job definition is sketched below the screenshot). If your edge application is currently running, it should receive the OTA deployment job, download the model package, and load it into the edge agent.
135 | 4. Verify that the deployment automation works by checking the log output on the edge device. You can also confirm that a new model version was deployed by checking the successful execution of the IoT job in the AWS IoT Core console (under "Manage" --> "Jobs"), as shown below.
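
If you prefer to approve the latest model version programmatically instead of through the Studio UI, a minimal sketch could look as follows; the model package group name matches the naming used in the image classification notebook, and `<project-name>` is a placeholder.

```python
# Hedged sketch: approve the newest version in a model package group so that it
# becomes eligible for compilation, edge packaging and deployment.
import boto3

sm_client = boto3.client("sagemaker")
group_name = "defect-detection-img-classification-<project-name>"  # placeholder

latest = sm_client.list_model_packages(
    ModelPackageGroupName=group_name,
    SortBy="CreationTime",
    SortOrder="Descending",
    MaxResults=1,
)["ModelPackageSummaryList"][0]

sm_client.update_model_package(
    ModelPackageArn=latest["ModelPackageArn"],
    ModelApprovalStatus="Approved",
)
```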
136 |
137 | 
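
The OTA deployment in step 3 is an IoT job whose document is interpreted by the OTA client in `src/edge/app/ota.py`. A condensed sketch of the call made in the notebooks, with placeholder values, is shown below.

```python
# Hedged sketch: create an OTA deployment job for the edge application. The job
# document fields mirror the ones used in the notebooks; region, account id,
# project name, bucket and key values are placeholders.
import json
import uuid

import boto3

iot_client = boto3.client("iot")

job_document = {
    "type": "new_model",
    "model_version": "2",
    "model_name": "img-classification",
    "model_package_bucket": "<bucket-name>",
    "model_package_key": "<path/to/packaged-model.tar.gz>",
}

iot_client.create_job(
    jobId=str(uuid.uuid4()),
    targets=["arn:aws:iot:<region>:<account-id>:thinggroup/defect-detection-<project-name>-group"],
    document=json.dumps(job_document),
    targetSelection="SNAPSHOT",
)
```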
138 |
139 | #### Persisting model configuration
140 |
141 | You can set which models should be loaded initially by configuring the `models_config.json` file. The application will instruct the edge agent to load these models upon startup. You can update model versions by creating IoT jobs from the cloud. The OTA IoT client running alongside the application will listen to the job topics and download the model accordingly. Please also note that for each new model you deploy you might have to adjust your application code accordingly (e.g. if your input shape changes). The structure of the `models_config.json` file with a sample configuration is shown below.
142 |
143 | In `"mappings"`, you can define which model should be used for each of the two inferences in the application; this name needs to align with the model name you choose during OTA deployment. In `"models"`, information about the models loaded into the edge agent is persisted even after you shut down the application. Please note that this section is filled out automatically by the application and saved before the application exits; you do not need to configure it manually. In case you want to use a manually deployed model package with this application, you can instruct the application to load it by adding a model definition to the JSON file under `"models"`.
144 |
145 | ```json
146 | {
147 | "mappings": {
148 | "image-classification-app": "img-classification",
149 | "image-segmentation-app": "unet"
150 | },
151 | "models": [
152 | {
153 | "name": "img-classification",
154 | "version": "1",
155 | "identifier": "img-classification-1"
156 | }
157 | ]
158 | }
159 | ```
160 |
161 | #### Running inference on the edge device
162 |
163 | To run inference on the device, you need to have fulfilled the following requirements:
164 |
165 | * The edge agent on the edge device is properly configured and can successfully authenticate against AWS IoT
166 | * You have downloaded test images onto the edge device in the folder `static/`
167 | * You have deployed at least one of the two models (image classification or semantic segmentation) via OTA updates
168 | * The edge agent is running and the models could be loaded successfully (for troubleshooting check command line output or edge agent logs in `agent/logs/agent.log`)
169 |
170 | If everything is configured correctly, you should see the edge application cycling through the provided images in the `static/` directory and running inference against both models. The inference results are then displayed in the web application. You can see a screenshot of the running web application below: the two models loaded into the edge agent are displayed at the top, the incoming image from the camera stream is fed into both models, and the predictions are illustrated at the bottom of the page.
171 |
172 | 
173 |
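Conceptually, the application's main loop boils down to the sketch below. The `predict` call is a hypothetical placeholder for the gRPC wrapper in `src/edge/app/edgeagentclient.py`; the real method names and signatures may differ.

```python
# Illustrative sketch of the inference cycle over the test images in static/ .
# edge_agent_client.predict() is a hypothetical placeholder for the gRPC calls
# wrapped by src/edge/app/edgeagentclient.py.
import glob
import time
from pathlib import Path


def inference_loop(edge_agent_client, classification_model, segmentation_model):
    """Cycle through the test images and run both models on each frame."""
    for image_path in sorted(glob.glob("static/*")):
        image_bytes = Path(image_path).read_bytes()

        # Feed the same frame into both models loaded in the edge agent.
        class_pred = edge_agent_client.predict(classification_model, image_bytes)
        mask_pred = edge_agent_client.predict(segmentation_model, image_bytes)

        # In the real application the results are rendered in the web UI;
        # here we simply print them.
        print(f"{image_path}: classification={class_pred}, segmentation={mask_pred}")
        time.sleep(1)  # simple pacing between frames
```
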
174 | #### Continuously deploying new model versions to the edge
175 |
176 | You can now continuously retrain your models on new data or with new parameter configurations and deploy them onto the edge device by running through steps 1-5 in [Automated model training in the cloud with Sagemaker Pipelines](#automated-model-training-in-the-cloud-with-sagemaker-pipelines) again. The application on the edge device automatically downloads a new model package if its version is higher than the one currently in use. It then unloads the old model version from the edge agent, loads the newer version once it is available, and persists its model configuration in the JSON file described in the [Persisting model configuration](#persisting-model-configuration) section above.
177 |
178 | ### Productionizing the solution
179 |
180 | This workshop showcases a simple way of managing deployments of multiple CV models onto an edge device for defect detection use cases. For the sake of simplicity, we run certain steps manually, e.g. preparing and deploying models from a SageMaker Studio notebook. In a production setting, we recommend using dedicated pipelines for both the model building component and the deployment component. Similar to the [MLOps reference architecture as outlined in the AWS blog](https://aws.amazon.com/blogs/apn/taming-machine-learning-on-aws-with-mlops-a-reference-architecture/), you would use Amazon EventBridge event rules to kick off the deployment process once approval of a new model version in the model registry is detected, as sketched below. Likewise, the pipeline execution would be triggered either by a commit to a connected code repository or by other events that require retraining (e.g. detected model drift or newly incoming data).
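
As an illustration, the sketch below creates such an EventBridge rule with boto3. The event pattern follows the documented SageMaker model package state-change event, and the target ARN is a placeholder for your own deployment function or pipeline trigger; verify both against the current documentation before relying on them.

```python
# Hedged sketch: react to a model package being approved in the SageMaker model
# registry and hand the event to a deployment target (placeholder ARN below).
import json
import boto3

events = boto3.client("events")

event_pattern = {
    "source": ["aws.sagemaker"],
    "detail-type": ["SageMaker Model Package State Change"],
    "detail": {
        "ModelPackageGroupName": ["defect-detection-img-classification-<project-name>"],
        "ModelApprovalStatus": ["Approved"],
    },
}

events.put_rule(
    Name="defect-detection-model-approved",
    EventPattern=json.dumps(event_pattern),
    State="ENABLED",
)

events.put_targets(
    Rule="defect-detection-model-approved",
    Targets=[{
        "Id": "deployment-target",
        "Arn": "arn:aws:lambda:<region>:<account-id>:function:<your-deployment-function>",
    }],
)
```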
181 |
182 | ### Troubleshooting and FAQ
183 |
184 | * *The application running on EC2 is not accessible via its public IP address.*
185 | Make sure you opened up the port your application is running on in the security group attached to the instance. If you cannot access the application through any port other than port 80, you can redirect incoming traffic from port 80 to port 8080 with a NAT rule using the *iptables* command line tool as follows: `sudo iptables -t nat -A PREROUTING -p tcp --dport 80 -j REDIRECT --to-port 8080`
186 | * *The edge application fails due to errors related to SageMaker Edge Manager.*
187 | You can try to restart the edge agent by killing the running process and starting the edge agent again with the provided shell script. Make sure that `models_config.json` is configured such that the desired models get loaded automatically upon application start. You can also check the agent logs under `agent/logs` for troubleshooting.
188 |
189 | ## References
190 |
191 | * aws-samples GitHub Repository "ML@Edge with SageMaker Edge Manager"
192 | https://github.com/aws-samples/amazon-sagemaker-edge-manager-workshop
193 | * Ronneberger, O., Fischer, P., & Brox, T. (2015). U-Net: Convolutional Networks for Biomedical Image Segmentation. MICCAI. https://arxiv.org/abs/1505.04597
194 | * Bozic, J., Tabernik, D. & Skocaj, D. (2021). Mixed supervision for surface-defect detection: from weakly to fully supervised learning. Computers in Industry. https://arxiv.org/abs/2104.06064
195 |
196 | ## Security
197 |
198 | See [CONTRIBUTING](CONTRIBUTING.md) for more information.
199 |
200 | ## License
201 |
202 | This library is licensed under the MIT-0 License. See the [LICENSE](LICENSE) file.
203 |
--------------------------------------------------------------------------------
/setup/template.yaml:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 | AWSTemplateFormatVersion: '2010-09-09'
  4 | Description: "This template will create the necessary IoT resources for the SM Edge Manager Workshop"
5 | Parameters:
6 | ProjectName:
7 | Type: String
8 | Description: A name for this project. This value defines the naming of many of the hereby created resources.
9 |
10 | Resources:
11 | ProjectArtifactsBucket:
12 | Type: AWS::S3::Bucket
13 | DeletionPolicy: Delete
14 | Properties:
15 | BucketName: !Sub sm-edge-workshop-${ProjectName}-${AWS::AccountId}
16 | BucketEncryption:
17 | ServerSideEncryptionConfiguration:
18 | - ServerSideEncryptionByDefault:
19 | SSEAlgorithm: AES256
20 | VersioningConfiguration:
21 | Status: Enabled
22 |
23 | EdgeDeviceRole:
24 | Type: AWS::IAM::Role
25 | Properties:
26 | RoleName: !Sub EdgeDeviceRole-${ProjectName}
27 | AssumeRolePolicyDocument:
28 | Version: "2012-10-17"
29 | Statement:
30 | - Effect: Allow
31 | Principal:
32 | Service:
33 | - sagemaker.amazonaws.com
34 | - iot.amazonaws.com
35 | - credentials.iot.amazonaws.com
36 | Action:
37 | - 'sts:AssumeRole'
38 | Path: /
39 | Policies:
40 | - PolicyName: !Sub EdgeDeviceRolePolicy-${ProjectName}
41 | PolicyDocument:
42 | Version: "2012-10-17"
43 | Statement:
44 | - Effect: Allow
45 | Action:
46 | - 's3:GetObject'
47 | - 's3:PutObject'
48 | - 's3:ListBucket'
49 | - 's3:GetBucketLocation'
50 | Resource:
51 | - !GetAtt ProjectArtifactsBucket.Arn
52 | - !Join [ '/', [ !GetAtt ProjectArtifactsBucket.Arn, '*' ] ]
53 | - Effect: Allow
54 | Action:
55 | - 's3:ListAllMyBuckets'
56 | Resource:
57 | - '*'
58 | - Effect: Allow
59 | Action:
60 | - 'iot:CreateRoleAlias'
61 | - 'iot:DescribeRoleAlias'
62 | - 'iot:UpdateRoleAlias'
63 | - 'iot:TagResource'
64 | - 'iot:ListTagsForResource'
65 | Resource:
66 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:rolealias/SageMakerEdge*"
67 | - Effect: Allow
68 | Action:
69 | - 'iam:GetRole'
70 | - 'iam:PassRole'
71 | Resource:
72 | - !Sub 'arn:aws:iam::${AWS::AccountId}:role/*SageMaker*'
73 | - !Sub 'arn:aws:iam::${AWS::AccountId}:role/*Sagemaker*'
74 | - !Sub 'arn:aws:iam::${AWS::AccountId}:role/*sagemaker*'
75 | - !Sub 'arn:aws:iam::${AWS::AccountId}:role/EdgeDeviceRole-${ProjectName}'
76 | - Effect: Allow
77 | Action:
78 | - 'sagemaker:GetDeviceRegistration'
79 | - 'sagemaker:SendHeartbeat'
80 | - 'sagemaker:DescribeDevice'
81 | Resource: "*"
82 | - Effect: Allow
83 | Action:
84 | - 'iot:DescribeEndpoint'
85 | Resource:
86 | - '*'
87 |
88 | DefectDetectionIotPolicy:
89 | Type: AWS::IoT::Policy
90 | Properties:
91 | PolicyName: !Sub "defect-detection-policy-${ProjectName}"
92 | PolicyDocument:
93 | Version: '2012-10-17'
94 | Statement:
95 | - Effect: Allow
96 | Action:
97 | - iot:Connect
98 | Resource:
99 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:client/*"
100 | - Effect: Allow
101 | Action:
102 | - iot:Publish
103 | - iot:Receive
104 | Resource:
105 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:topic/defect-detection/*"
106 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:topic/$aws/*"
107 | - Effect: Allow
108 | Action:
109 | - iot:Subscribe
110 | Resource:
111 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:topicfilter/defect-detection/*"
112 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:topicfilter/$aws/*"
113 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:topic/$aws/*"
114 | - Effect: Allow
115 | Action:
116 | - iot:UpdateThingShadow
117 | Resource:
118 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:topicfilter/defect-detection/*"
119 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:thing/edge-device-*"
120 | - Effect: Allow
121 | Action:
122 | - iot:AssumeRoleWithCertificate
123 | Resource:
124 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:rolealias/SageMakerEdge-defect-detection-${ProjectName}"
125 |
126 | EdgeDeviceThing:
127 | Type: AWS::IoT::Thing
128 | Properties:
129 | ThingName: !Sub edge-device-${ProjectName}
130 |
131 | EdgeDeviceFleet:
132 | Type: AWS::SageMaker::DeviceFleet
133 | Properties:
134 |       Description: SageMaker Edge Manager device fleet for the defect detection project
135 | DeviceFleetName: !Sub defect-detection-${ProjectName}
136 | OutputConfig:
137 | S3OutputLocation: !Join [ '/', [ 's3:/', !Ref 'ProjectArtifactsBucket', 'data'] ]
138 | RoleArn: !GetAtt EdgeDeviceRole.Arn
139 |
140 | DefectDetectionModelPackageGroupImgClassification:
141 | Type: AWS::SageMaker::ModelPackageGroup
142 | Properties:
143 | ModelPackageGroupName: !Sub defect-detection-img-classification-${ProjectName}
144 | ModelPackageGroupDescription: A model package group for your image classification models
145 |
146 | DefectDetectionModelPackageGroupSemanticSegmentation:
147 | Type: AWS::SageMaker::ModelPackageGroup
148 | Properties:
149 | ModelPackageGroupName: !Sub defect-detection-semantic-segmentation-${ProjectName}
150 | ModelPackageGroupDescription: A model package group for your semantic segmentation models
151 |
152 | CustomResourceLambdaRole:
153 | Type: AWS::IAM::Role
154 | Properties:
155 | RoleName: !Sub CustomResourceLambdaRole-${ProjectName}
156 | AssumeRolePolicyDocument:
157 | Version: "2012-10-17"
158 | Statement:
159 | - Effect: Allow
160 | Principal:
161 | Service:
162 | - lambda.amazonaws.com
163 | Action:
164 | - 'sts:AssumeRole'
165 | Path: /
166 | Policies:
167 | - PolicyDocument:
168 | Version: "2012-10-17"
169 | Statement:
170 | - Effect: Allow
171 | Action:
172 | - 'iot:*'
173 | - 'sagemaker:*'
174 | Resource:
175 | - '*'
176 | - Effect: Allow
177 | Action:
178 | - 's3:*'
179 | Resource:
180 | - !GetAtt ProjectArtifactsBucket.Arn
181 | - !Join [ '', [ !GetAtt ProjectArtifactsBucket.Arn, '/*'] ]
182 | - 'arn:aws:s3:::sagemaker-edge-release-store-us-west-2-linux-x64/*'
183 | - 'arn:aws:s3:::sagemaker-edge-release-store-us-west-2-linux-x64'
184 | PolicyName: !Sub CustomResourceLambdaPolicy-${ProjectName}
185 | ManagedPolicyArns:
186 | - arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole
187 |
188 | PrepareDevicePackageCustomResourceLambda:
189 | Type: AWS::Lambda::Function
190 | Properties:
191 | Role: !GetAtt CustomResourceLambdaRole.Arn
192 | FunctionName: !Sub PrepareDevicePackage-CfnCustomResource-${ProjectName}
193 | Runtime: python3.8
194 | Handler: index.lambda_handler
195 | Timeout: 15
196 | Code:
197 | ZipFile: |
198 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
199 | # SPDX-License-Identifier: MIT-0
200 | """
201 | Lambda-backed custom resource function to create the SageMaker Edge Manager device package.
202 | Support SageMaker Edge Agent Version:
203 | """
204 | import json
205 | import os
206 | import logging
207 | import stat
208 | from botocore.parsers import LOG
209 | import urllib3
210 | import boto3
211 | import tarfile
212 | import io
213 | from botocore.exceptions import ClientError
214 |
215 | http = urllib3.PoolManager()
216 |
217 | LOGGER = logging.getLogger()
218 | LOGGER.setLevel(logging.INFO)
219 |
220 | BUCKET_NAME = os.environ['BUCKET_NAME']
221 | PROJECT_NAME = os.environ['PROJECT_NAME']
222 | AWS_REGION = os.environ['AWS_REGION']
223 |
224 | LOCAL_DIR_PREFIX = '/tmp/' # Needed for running in AWS Lambda
225 |
226 | iot_client = boto3.client('iot')
227 | sm_client = boto3.client('sagemaker')
228 | s3_client = boto3.client('s3')
229 |
230 | # Global variables
231 | # This information needs to match with what was defined in the CloudFormation template
232 | sm_edge_device_name = 'edge-device-defect-detection-%s' % PROJECT_NAME
233 | iot_policy_name = 'defect-detection-policy-%s' % PROJECT_NAME
234 | iot_thing_name = 'edge-device-%s' % PROJECT_NAME
235 | iot_thing_group_name='defect-detection-%s-group' % PROJECT_NAME
236 | sm_em_fleet_name = 'defect-detection-%s' % PROJECT_NAME
237 | role_alias = 'SageMakerEdge-%s' % sm_em_fleet_name
238 |
239 |
240 | def cfn_cleanup():
241 | """Clean up resources created in the custom resources"""
242 |
243 | LOGGER.info('Deleting role alias if exists')
244 | try:
245 | iot_client.delete_role_alias(roleAlias=role_alias)
246 | except:
247 | LOGGER.info('Role alias deletion failed, continuing anyways')
248 |
249 | LOGGER.info('Deregistering device from edge fleet if exists')
250 | try:
251 | sm_client.deregister_devices(
252 | DeviceFleetName=sm_em_fleet_name,
253 | DeviceNames=[sm_edge_device_name]
254 | )
255 | except:
256 | LOGGER.info('Device deregistration failed, continuing anyways')
257 |
258 | LOGGER.info('Detaching certificates')
259 | try:
260 | cert_arn = iot_client.list_thing_principals(thingName=iot_thing_name)['principals'][0]
261 | cert_id = cert_arn.split('/')[-1]
262 | iot_client.detach_policy(policyName=iot_policy_name, target=cert_arn)
263 | iot_client.detach_thing_principal(thingName=iot_thing_name, principal=cert_arn)
264 | iot_client.update_certificate(certificateId=cert_id, newStatus='INACTIVE')
265 | iot_client.delete_certificate(certificateId=cert_id, forceDelete=True)
266 | iot_client.delete_thing_group(thingGroupName=iot_thing_group_name)
267 | except:
268 | LOGGER.info('Detaching certificates failed, continuing anyways')
269 |
270 |
271 |
272 |
273 | def lambda_handler(event, context):
274 | '''Handle Lambda event from AWS'''
275 |
276 | try:
277 | LOGGER.info('REQUEST RECEIVED:\n %s', event)
278 | LOGGER.info('REQUEST RECEIVED:\n %s', context)
279 | if event['RequestType'] == 'Create':
280 | LOGGER.info('CREATE!')
281 |
282 | LOGGER.info('Starting device packaging...')
283 | try:
284 | prepare_device_package(event, context)
285 | send_response(event, context, "SUCCESS",
286 | {"Message": "Resource creation successful!"})
287 | except Exception as e:
288 | send_response(event, context, "FAILED", {"Message": "Resource creation failed during device packaging!", "Error": str(e)})
289 | elif event['RequestType'] == 'Update':
290 | LOGGER.info('UPDATE!')
291 | send_response(event, context, "SUCCESS",
292 | {"Message": "Resource update successful!"})
293 | elif event['RequestType'] == 'Delete':
294 | LOGGER.info('DELETE!')
295 | # Start cleanup method
296 | cfn_cleanup()
297 | send_response(event, context, "SUCCESS",
298 | {"Message": "Resource deletion successful!"})
299 | else:
300 | LOGGER.info('FAILED!')
301 | send_response(event, context, "FAILED",
302 | {"Message": "Unexpected event received from CloudFormation"})
303 | except: #pylint: disable=W0702
304 | LOGGER.info('FAILED!')
305 | send_response(event, context, "FAILED", {
306 | "Message": "Exception during processing"})
307 |
308 |
309 | def send_response(event, context, response_status, response_data):
310 | '''Send a resource manipulation status response to CloudFormation'''
311 | response_body = json.dumps({
312 | "Status": response_status,
313 | "Reason": "See the details in CloudWatch Log Stream: " + context.log_stream_name,
314 | "PhysicalResourceId": context.log_stream_name,
315 | "StackId": event['StackId'],
316 | "RequestId": event['RequestId'],
317 | "LogicalResourceId": event['LogicalResourceId'],
318 | "Data": response_data
319 | })
320 |
321 | print("Response body:")
322 | print(response_body)
323 |
324 | response_url = event['ResponseURL']
325 |
326 | headers = {
327 | 'content-type' : '',
328 | 'content-length' : str(len(response_body))
329 | }
330 |
331 | try:
332 | response = http.request('PUT', response_url, headers=headers, body=response_body)
333 | print("Status code:", response.status)
334 |
335 | except Exception as e:
336 |
337 | print("send(..) failed executing http.request(..):", e)
338 |
339 |
340 | def setup_agent(thing_group_name, thing_group_arn):
341 | """Creates configuration file and sets up SageMaker Edge Agent for deployment
342 | onto a Amazon S3 bucket. Registers a device with a device fleet, creates IoT
343 | certificates and attaches them to the previously created IoT thing. Saves
344 | certificates onto local disk to make it ready for uploading to S3.
345 |
346 | Args:
347 | thing_group_name (string): a name for the IoT thing group
348 | thing_group_arn (string): the ARN of the IoT thing group
349 | """
350 |
351 | local_base_path = LOCAL_DIR_PREFIX + "agent/certificates/iot/edge_device_cert_%s.pem"
352 | relative_base_path = "agent/certificates/iot/edge_device_cert_%s.pem"
353 | thing_arn_template = thing_group_arn.replace('thinggroup', 'thing').replace(thing_group_name, '%s')
354 | cred_host = iot_client.describe_endpoint(endpointType='iot:CredentialProvider')['endpointAddress']
355 |
356 | # Check length of device name string
357 | if len(sm_edge_device_name) > 64:
358 | LOGGER.error("Device name for edge device is too long. Needs to be <64 characters.")
359 |                   raise Exception('Device name for edge device is longer than 64 characters. Please choose a shorter value for ProjectName.')
360 |
361 | # register the device in the fleet
362 |               # the device name must not exceed 64 characters (checked above)
363 | dev = [{'DeviceName': sm_edge_device_name, 'IotThingName': iot_thing_name}]
364 | try:
365 | sm_client.describe_device(DeviceFleetName=sm_em_fleet_name, DeviceName=sm_edge_device_name)
366 | LOGGER.info("Device was already registered on SageMaker Edge Manager")
367 | except ClientError as e:
368 | if e.response['Error']['Code'] != 'ValidationException': raise e
369 | LOGGER.info("Registering a new device %s on fleet %s" % (sm_edge_device_name, sm_em_fleet_name))
370 | sm_client.register_devices(DeviceFleetName=sm_em_fleet_name, Devices=dev)
371 | iot_client.add_thing_to_thing_group(
372 | thingGroupName=thing_group_name,
373 | thingGroupArn=thing_group_arn,
374 | thingName=iot_thing_name,
375 | thingArn=thing_arn_template % iot_thing_name
376 | )
377 |
378 | # if you reach this point you need to create new certificates
379 | # generate the certificates
380 | cert = local_base_path % ('cert')
381 |               key = local_base_path % ('key')
382 |               pub = local_base_path % ('pub')
383 |
384 | # Relative paths needed for setting path in config file
385 | cert_relative = relative_base_path % ('cert')
386 |               key_relative = relative_base_path % ('key')
387 |               pub_relative = relative_base_path % ('pub')
388 |
389 | cert_meta=iot_client.create_keys_and_certificate(setAsActive=True)
390 | cert_arn = cert_meta['certificateArn']
391 | with open(cert, 'w') as c: c.write(cert_meta['certificatePem'])
392 | with open(key, 'w') as c: c.write(cert_meta['keyPair']['PrivateKey'])
393 | with open(pub, 'w') as c: c.write(cert_meta['keyPair']['PublicKey'])
394 |
395 | # attach the certificates to the policy and to the thing
396 | iot_client.attach_policy(policyName=iot_policy_name, target=cert_arn)
397 | iot_client.attach_thing_principal(thingName=iot_thing_name, principal=cert_arn)
398 |
399 | LOGGER.info("Creating agent config JSON file")
400 |
401 | # Please note that the $WORKDIR variables need to be replaced by the absolute path of the working directory of your project.
402 | # If you follow the guide, the install script will automatically replace those.
403 | agent_params = {
404 | "sagemaker_edge_core_device_name": sm_edge_device_name,
405 | "sagemaker_edge_core_device_fleet_name": sm_em_fleet_name,
406 | "sagemaker_edge_core_region": AWS_REGION,
407 | "sagemaker_edge_provider_provider": "Aws",
408 | "sagemaker_edge_provider_provider_path" : "$WORKDIR/agent/lib/libprovider_aws.so",
409 | "sagemaker_edge_core_root_certs_path": "$WORKDIR/agent/certificates/root",
410 | "sagemaker_edge_provider_aws_ca_cert_file": "$WORKDIR/agent/certificates/iot/AmazonRootCA1.pem",
411 | "sagemaker_edge_provider_aws_cert_file": "$WORKDIR/%s" % cert_relative,
412 | "sagemaker_edge_provider_aws_cert_pk_file": "$WORKDIR/%s" % key_relative,
413 | "sagemaker_edge_provider_aws_iot_cred_endpoint": "https://%s/role-aliases/%s/credentials" % (cred_host,role_alias),
414 | "sagemaker_edge_core_capture_data_destination": "Cloud",
415 | "sagemaker_edge_provider_s3_bucket_name": BUCKET_NAME,
416 | "sagemaker_edge_core_folder_prefix": "edge-agent-inference-data-capture",
417 | "sagemaker_edge_core_capture_data_buffer_size": 30,
418 | "sagemaker_edge_core_capture_data_batch_size": 10,
419 | "sagemaker_edge_core_capture_data_push_period_seconds": 10,
420 | "sagemaker_edge_core_capture_data_base64_embed_limit": 2,
421 | "sagemaker_edge_log_verbose": False
422 | }
423 | with open(LOCAL_DIR_PREFIX + 'agent/conf/config_edge_device.json', 'w') as conf:
424 | conf.write(json.dumps(agent_params, indent=4))
425 |
426 |
427 | def prepare_device_package(event, context):
428 | """Prepares the edge device package in a lambda function and uploads it to the S3 bucket"""
429 |
430 | # create a new thing group
431 | thing_group_arn = None
432 | agent_pkg_bucket = 'sagemaker-edge-release-store-us-west-2-linux-x64'
433 | agent_config_package_prefix = 'edge-device-configuration/agent/config.tgz'
434 |
435 | # check if edge agent package has already been built
436 | try:
437 | s3_client.download_file(Bucket=BUCKET_NAME, Key=agent_config_package_prefix, Filename='/tmp/dump')
438 | LOGGER.info('The agent configuration package was already built! Skipping...')
439 |                   return  # package already exists, nothing to do
440 | except ClientError as e:
441 | pass
442 |
443 | # Create a new thing group if not found yet
444 | try:
445 | thing_group_arn = iot_client.describe_thing_group(thingGroupName=iot_thing_group_name)['thingGroupArn']
446 | LOGGER.info("Thing group found")
447 | except iot_client.exceptions.ResourceNotFoundException as e:
448 | LOGGER.info("Creating a new thing group")
449 | thing_group_arn = iot_client.create_thing_group(thingGroupName=iot_thing_group_name)['thingGroupArn']
450 |
451 | LOGGER.info("Creating the directory structure for the agent")
452 | # create a structure for the agent files
453 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/certificates/root', exist_ok=True)
454 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/certificates/iot', exist_ok=True)
455 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/logs', exist_ok=True)
456 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/model', exist_ok=True)
457 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/conf', exist_ok=True)
458 |
459 | LOGGER.info("Downloading root certificate and agent binary")
460 | # then get some root certificates
461 | resp = http.request('GET', 'https://www.amazontrust.com/repository/AmazonRootCA1.pem')
462 | with open(LOCAL_DIR_PREFIX + 'agent/certificates/iot/AmazonRootCA1.pem', 'w') as c:
463 | c.write(resp.data.decode('utf-8'))
464 |
465 | # this certificate validates the edge manage package
466 | s3_client.download_file(
467 | Bucket=agent_pkg_bucket,
468 | Key='Certificates/%s/%s.pem' % (AWS_REGION, AWS_REGION),
469 | Filename=LOCAL_DIR_PREFIX + 'agent/certificates/root/%s.pem' % AWS_REGION
470 | )
471 |
472 | LOGGER.info("Adjusting file permissions of pem files")
473 | # adjust the permissions of the files
474 | os.chmod(LOCAL_DIR_PREFIX + 'agent/certificates/iot/AmazonRootCA1.pem', stat.S_IRUSR|stat.S_IRGRP)
475 | os.chmod(LOCAL_DIR_PREFIX + 'agent/certificates/root/%s.pem' % AWS_REGION, stat.S_IRUSR|stat.S_IRGRP)
476 |
477 | LOGGER.info("Processing the agent...")
478 | setup_agent(iot_thing_group_name, thing_group_arn )
479 |
480 | LOGGER.info("Creating the final package...")
481 | with io.BytesIO() as f:
482 | with tarfile.open(fileobj=f, mode='w:gz') as tar:
483 | tar.add(LOCAL_DIR_PREFIX + 'agent', 'agent', recursive=True)
484 | f.seek(0)
485 | LOGGER.info("Uploading to S3")
486 | s3_client.upload_fileobj(f, Bucket=BUCKET_NAME, Key=agent_config_package_prefix)
487 | LOGGER.info("Done!")
488 | Environment:
489 | Variables:
490 | BUCKET_NAME: !Ref ProjectArtifactsBucket
491 | PROJECT_NAME: !Ref ProjectName
492 |
493 | PrepareDevicePackageCR:
494 | Type: Custom::PrepareDevicePackage
495 | Properties:
496 | ServiceToken: !GetAtt PrepareDevicePackageCustomResourceLambda.Arn
497 |
498 | Outputs:
499 | EdgeDeviceIoTThingOutput:
500 | Description: The edge device IoT Thing where SageMaker Edge Manager will be running
501 | Value: !Ref EdgeDeviceThing
502 | EdgeDeviceFleetOutput:
503 | Description: The edge device fleet of SageMaker Edge Manager which contains the IoT things
504 | Value: !Ref EdgeDeviceFleet
505 | EdgeDeviceRoleOutput:
506 | Description: The IAM role which is mapped to the edge device certificate
507 | Value: !Ref EdgeDeviceRole
508 | EdgeDeviceRoleAliasOutput:
509 |     Description: The IoT role alias which connects the device certificate to an IAM role
510 | Value: !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:rolealias/SageMakerEdge-defect-detection-${ProjectName}"
511 | ArtifactsBucketOutput:
512 | Description: The S3 bucket which contains the packaged edge agent configuration files
513 | Value: !Ref ProjectArtifactsBucket
514 |
--------------------------------------------------------------------------------