├── setup ├── .gitkeep ├── lambda-custom-resource │ └── prepare_dev_package_cr.py └── template.yaml ├── src ├── cloud │ ├── pipelines │ │ ├── semantic_segmentation │ │ │ ├── requirements.txt │ │ │ ├── __init__.py │ │ │ ├── train_tf.py │ │ │ ├── pipeline.py │ │ │ └── preprocessing.py │ │ ├── __init__.py │ │ ├── image_classification │ │ │ ├── __init__.py │ │ │ ├── evaluation.py │ │ │ ├── preprocessing.py │ │ │ └── pipeline.py │ │ ├── __version__.py │ │ ├── _utils.py │ │ ├── get_pipeline_definition.py │ │ └── run_pipeline.py │ ├── data_preparation.ipynb │ ├── semantic_segmentation_pipeline.ipynb │ └── image_classification_pipeline.ipynb └── edge │ ├── models_config.json │ ├── requirements.txt │ ├── app │ ├── __init__.py │ ├── util.py │ ├── logger.py │ ├── edgeagentclient.py │ └── ota.py │ ├── start_edge_agent.sh │ ├── templates │ ├── main_noimg.html │ ├── base.html │ └── main.html │ ├── install.py │ └── run.py ├── img ├── iot_job.png ├── pipeline.png ├── architecture.png ├── edge_config.png ├── inferece_ui.png ├── cloudformation.png └── kolektor_sdd2.png ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── CONTRIBUTING.md └── README.md /setup/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/cloud/pipelines/semantic_segmentation/requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python -------------------------------------------------------------------------------- /img/iot_job.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-edge-defect-detection-computer-vision/HEAD/img/iot_job.png -------------------------------------------------------------------------------- /img/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-edge-defect-detection-computer-vision/HEAD/img/pipeline.png -------------------------------------------------------------------------------- /src/cloud/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: MIT-0 -------------------------------------------------------------------------------- /img/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-edge-defect-detection-computer-vision/HEAD/img/architecture.png -------------------------------------------------------------------------------- /img/edge_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-edge-defect-detection-computer-vision/HEAD/img/edge_config.png -------------------------------------------------------------------------------- /img/inferece_ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-edge-defect-detection-computer-vision/HEAD/img/inferece_ui.png -------------------------------------------------------------------------------- /img/cloudformation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-edge-defect-detection-computer-vision/HEAD/img/cloudformation.png -------------------------------------------------------------------------------- /img/kolektor_sdd2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-edge-defect-detection-computer-vision/HEAD/img/kolektor_sdd2.png -------------------------------------------------------------------------------- /src/cloud/pipelines/image_classification/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 -------------------------------------------------------------------------------- /src/cloud/pipelines/semantic_segmentation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: MIT-0 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_STORE 2 | __pycache__/ 3 | .ipynb_checkpoints 4 | 5 | # Ignore the data directory in git to not upload large files to repo 6 | src/cloud/data/ 7 | -------------------------------------------------------------------------------- /src/edge/models_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "mappings": { 3 | "image-classification-app": "img-classification", 4 | "image-segmentation-app": "unet" 5 | }, 6 | "models": [ 7 | ] 8 | } -------------------------------------------------------------------------------- /src/edge/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.22.0 2 | Pillow==10.3.0 3 | sysv-ipc==1.1.0 4 | boto3==1.17.89 5 | grpcio-tools==1.38.0 6 | grpcio==1.53.2 7 | protobuf==3.18.3 8 | paho-mqtt==1.5.1 9 | waitress==3.0.1 10 | Flask==2.3.2 11 | -------------------------------------------------------------------------------- /src/edge/app/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 4 | 5 | from .edgeagentclient import EdgeAgentClient 6 | from .ota import OTAModelUpdate 7 | from .logger import Logger 8 | from .util import * -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /src/cloud/pipelines/__version__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | """Metadata for the pipelines package.""" 4 | 5 | __title__ = "sm-pipelines-defect-detection" 6 | __description__ = "Defect detection pipelines package" 7 | __version__ = "0.0.1" 8 | __author__ = "lichtend" 9 | __author_email__ = "lichtend@amazon.com" 10 | __license__ = "MIT" 11 | __url__ = "https://aws.amazon.com/" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 4 | software and associated documentation files (the "Software"), to deal in the Software 5 | without restriction, including without limitation the rights to use, copy, modify, 6 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 7 | permit persons to whom the Software is furnished to do so. 
8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 10 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 11 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 12 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 13 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 14 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /src/cloud/pipelines/_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | """Provides utilities for SageMaker Pipeline CLI.""" 4 | from __future__ import absolute_import 5 | 6 | import ast 7 | 8 | 9 | def get_pipeline_driver(module_name, passed_args=None): 10 | """Gets the driver for generating your pipeline definition. 11 | 12 | Pipeline modules must define a get_pipeline() module-level method. 13 | 14 | Args: 15 | module_name: The module name of your pipeline. 16 | passed_args: Optional passed arguments that your pipeline may be templated by. 17 | 18 | Returns: 19 | The SageMaker Workflow pipeline. 20 | """ 21 | _imports = __import__(module_name, fromlist=["get_pipeline"]) 22 | kwargs = convert_struct(passed_args) 23 | return _imports.get_pipeline(**kwargs) 24 | 25 | 26 | def convert_struct(str_struct=None): 27 | return ast.literal_eval(str_struct) if str_struct else {} 28 | -------------------------------------------------------------------------------- /src/edge/start_edge_agent.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: MIT-0 4 | if [ "$SM_EDGE_AGENT_HOME" == "" ]; then 5 | echo "You need to define the env var: SM_EDGE_AGENT_HOME" 6 | exit 1 7 | fi 8 | 9 | echo "SM_EDGE_AGENT_HOME: $SM_EDGE_AGENT_HOME" 10 | AGENT_PID_FILE='/tmp/edge_agent.pid' 11 | APP_PID_FILE='/tmp/edge_app.pid' 12 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 13 | 14 | if ! test -f "$AGENT_PID_FILE" || ! kill -0 $(cat $AGENT_PID_FILE) 2> /dev/null; then 15 | echo "Starting the agent" 16 | rm -f nohup.out /tmp/edge_agent 17 | nohup $SM_EDGE_AGENT_HOME/bin/sagemaker_edge_agent_binary -a /tmp/edge_agent -c $SM_EDGE_AGENT_HOME/conf/config_edge_device.json >> $SM_EDGE_AGENT_HOME/logs/agent.log 2>&1 & 18 | AGENT_PID=$! 19 | echo $AGENT_PID > $AGENT_PID_FILE 20 | fi 21 | echo "AGENT PID: $(cat $AGENT_PID_FILE)" 22 | 23 | echo "Note: Please verify that the edge agent is running by using the command \"ps aux | grep [s]agemaker_edge_agent_binary\". In case you do not see any process running, please check the log file \"$SM_EDGE_AGENT_HOME/logs/agent.log\"". -------------------------------------------------------------------------------- /src/edge/templates/main_noimg.html: -------------------------------------------------------------------------------- 1 | 3 | {% extends 'base.html' %} {% block header %} 4 | 5 | {% endblock %} {% block content %} 6 |
7 |
8 |
Edge Agent
9 |
10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | {% for m in loaded_models %} 20 | 21 | 22 | 23 | 24 | 25 | {% endfor %} 26 | 27 |
Model NameModel VersionModel Identifier
{{ m.name }}{{ m.version }}{{ m.identifier }}
28 |
29 |
30 | 31 | 36 | 37 | 38 |
39 | {% endblock %} 40 | -------------------------------------------------------------------------------- /src/edge/templates/base.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | SageMaker Edge Application 5 | 6 | 7 | 8 | 26 | 27 | 28 |
29 |
30 | {% block header %}{% endblock %} 31 |
32 | {% for message in get_flashed_messages() %} 33 |
{{ message }}
34 | {% endfor %} 35 | {% block content %}{% endblock %} 36 |
-------------------------------------------------------------------------------- /src/cloud/pipelines/get_pipeline_definition.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | """A CLI to get pipeline definitions from pipeline modules.""" 4 | from __future__ import absolute_import 5 | 6 | import argparse 7 | import sys 8 | 9 | from pipelines._utils import get_pipeline_driver 10 | 11 | 12 | def main(): # pragma: no cover 13 | """The main harness that gets the pipeline definition JSON. 14 | 15 | Prints the json to stdout or saves to file. 16 | """ 17 | parser = argparse.ArgumentParser("Gets the pipeline definition for the pipeline script.") 18 | 19 | parser.add_argument( 20 | "-n", 21 | "--module-name", 22 | dest="module_name", 23 | type=str, 24 | help="The module name of the pipeline to import.", 25 | ) 26 | parser.add_argument( 27 | "-f", 28 | "--file-name", 29 | dest="file_name", 30 | type=str, 31 | default=None, 32 | help="The file to output the pipeline definition json to.", 33 | ) 34 | parser.add_argument( 35 | "-kwargs", 36 | "--kwargs", 37 | dest="kwargs", 38 | default=None, 39 | help="Dict string of keyword arguments for the pipeline generation (if supported)", 40 | ) 41 | args = parser.parse_args() 42 | 43 | if args.module_name is None: 44 | parser.print_help() 45 | sys.exit(2) 46 | 47 | try: 48 | pipeline = get_pipeline_driver(args.module_name, args.kwargs) 49 | content = pipeline.definition() 50 | if args.file_name: 51 | with open(args.file_name, "w") as f: 52 | f.write(content) 53 | else: 54 | print(content) 55 | except Exception as e: # pylint: disable=W0703 56 | print(f"Exception: {e}") 57 | sys.exit(1) 58 | 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /src/edge/app/util.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: MIT-0 3 | import numpy as np 4 | import boto3 5 | import requests 6 | import PIL 7 | import io 8 | import base64 9 | 10 | def create_dataset(X, time_steps=1, step=1): 11 | ''' 12 | Format a timeseries buffer into a multidimensional tensor 13 | required by the model 14 | ''' 15 | Xs = [] 16 | for i in range(0, len(X) - time_steps, step): 17 | v = X[i:(i + time_steps)] 18 | Xs.append(v) 19 | return np.array(Xs) 20 | 21 | def get_aws_credentials(cred_endpoint, thing_name, cert_file, key_file, ca_file): 22 | ''' 23 | Invoke the AWS IoT credentials provider endpoint to exchange the device certificates 24 | for temporary AWS credentials 25 | ''' 26 | resp = requests.get( 27 | cred_endpoint, 28 | cert=(cert_file, key_file, ca_file), 29 | ) 30 | if not resp: 31 | raise Exception('Error while getting the IoT credentials: ', resp) 32 | credentials = resp.json() 33 | return (credentials['credentials']['accessKeyId'], 34 | credentials['credentials']['secretAccessKey'], 35 | credentials['credentials']['sessionToken']) 36 | 37 | def get_client(service_name, iot_params): 38 | ''' 39 | Build a boto3 client for a given service 40 | using the temporary credentials obtained by exchanging the device certificates 41 | ''' 42 | access_key_id,secret_access_key,session_token = get_aws_credentials( 43 | iot_params['sagemaker_edge_provider_aws_iot_cred_endpoint'], 44 | iot_params['sagemaker_edge_core_device_name'], 45 | iot_params['sagemaker_edge_provider_aws_cert_file'], 46 | iot_params['sagemaker_edge_provider_aws_cert_pk_file'], 47 | iot_params['sagemaker_edge_provider_aws_ca_cert_file'] 48 | ) 49 | return boto3.client( 50 | service_name, iot_params['sagemaker_edge_core_region'], 51 | aws_access_key_id=access_key_id, 52 | aws_secret_access_key=secret_access_key, 53 | aws_session_token=session_token 54 | ) 55 | 56 | def create_b64_img_from_mask(mask): 57 | """Creates a base64-encoded PNG from a (1, SIZE, SIZE)-shaped binary mask""" 58 | img_size = mask.shape[1] 59 | mask_reshaped = np.reshape(mask, (img_size, img_size)) 60 | img = PIL.Image.fromarray(np.uint8(mask_reshaped)*255) 61 | img_binary = io.BytesIO() 62 | img.save(img_binary, 'PNG') 63 | img_b64 = base64.b64encode(img_binary.getvalue()) 64 | return img_b64 -------------------------------------------------------------------------------- /src/edge/app/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0 3 | import threading 4 | import json 5 | import logging 6 | import app.util as util 7 | 8 | IOT_BASE_TOPIC = 'edge-manager-app' 9 | 10 | class Logger(object): 11 | def __init__(self, device_name, iot_params): 12 | ''' 13 | This class is responsible for sending application logs 14 | to the cloud via MQTT and IoT Topics 15 | ''' 16 | self.device_name = device_name 17 | logging.info("Device Name: %s" % self.device_name) 18 | self.iot_params = iot_params 19 | 20 | self.__update_credentials() 21 | 22 | self.logs_buffer = [] 23 | self.__log_lock = threading.Lock() 24 | 25 | def __update_credentials(self): 26 | ''' 27 | Get new temp credentials 28 | ''' 29 | logging.info("Getting the IoT Credentials") 30 | self.iot_data_client = util.get_client('iot-data', self.iot_params) 31 | 32 | def __run_logs_upload_job__(self): 33 | ''' 34 | Launch a thread that will read the logs buffer 35 | prepare a json document and send the logs 36 | ''' 37 | self.cloud_log_sync_job = threading.Thread(target=self.__upload_logs__) 38 | self.cloud_log_sync_job.start() 39 | 40 | def __upload_logs__(self): 41 | ''' 42 | Invoked by the thread to publish the latest logs 43 | ''' 44 | self.__log_lock.acquire(True) 45 | f = json.dumps({'logs': self.logs_buffer}) 46 | self.logs_buffer = [] # clean the buffer 47 | try: 48 | self.iot_data_client.publish( topic='%s/logs/%s' % (IOT_BASE_TOPIC, self.device_name), payload=f.encode('utf-8') ) 49 | except Exception as e: 50 | logging.error(e) 51 | self.__update_credentials() 52 | self.iot_data_client.publish( topic='%s/logs/%s' % (IOT_BASE_TOPIC, self.device_name), payload=f.encode('utf-8') ) 53 | 54 | logging.info("New log file uploaded. len: %d" % len(f)) 55 | self.__log_lock.release() 56 | 57 | def publish_logs(self, data): 58 | ''' 59 | Invoked by the application, it buffers the logs 60 | ''' 61 | buffer_len = 0 62 | if self.__log_lock.acquire(False): 63 | self.logs_buffer.append(data) 64 | buffer_len = len(self.logs_buffer) 65 | self.__log_lock.release() 66 | # else: job is running, discard the new data 67 | if buffer_len > 10: 68 | # run the sync job 69 | self.__run_logs_upload_job__() -------------------------------------------------------------------------------- /src/cloud/pipelines/run_pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | """A CLI to create or update and run pipelines.""" 4 | from __future__ import absolute_import 5 | 6 | import argparse 7 | import json 8 | import sys 9 | 10 | from pipelines._utils import get_pipeline_driver, convert_struct 11 | 12 | 13 | def main(): # pragma: no cover 14 | """The main harness that creates or updates and runs the pipeline. 15 | 16 | Creates or updates the pipeline and runs it. 17 | """ 18 | parser = argparse.ArgumentParser( 19 | "Creates or updates and runs the pipeline for the pipeline script." 
20 | ) 21 | 22 | parser.add_argument( 23 | "-n", 24 | "--module-name", 25 | dest="module_name", 26 | type=str, 27 | help="The module name of the pipeline to import.", 28 | ) 29 | parser.add_argument( 30 | "-kwargs", 31 | "--kwargs", 32 | dest="kwargs", 33 | default=None, 34 | help="Dict string of keyword arguments for the pipeline generation (if supported)", 35 | ) 36 | parser.add_argument( 37 | "-role-arn", 38 | "--role-arn", 39 | dest="role_arn", 40 | type=str, 41 | help="The role arn for the pipeline service execution role.", 42 | ) 43 | parser.add_argument( 44 | "-description", 45 | "--description", 46 | dest="description", 47 | type=str, 48 | default=None, 49 | help="The description of the pipeline.", 50 | ) 51 | parser.add_argument( 52 | "-tags", 53 | "--tags", 54 | dest="tags", 55 | default=None, 56 | help="""List of dict strings of '[{"Key": "string", "Value": "string"}, ..]'""", 57 | ) 58 | args = parser.parse_args() 59 | 60 | if args.module_name is None or args.role_arn is None: 61 | parser.print_help() 62 | sys.exit(2) 63 | tags = convert_struct(args.tags) 64 | 65 | try: 66 | pipeline = get_pipeline_driver(args.module_name, args.kwargs) 67 | print("###### Creating/updating a SageMaker Pipeline with the following definition:") 68 | parsed = json.loads(pipeline.definition()) 69 | print(json.dumps(parsed, indent=2, sort_keys=True)) 70 | 71 | upsert_response = pipeline.upsert( 72 | role_arn=args.role_arn, description=args.description, tags=tags 73 | ) 74 | print("\n###### Created/Updated SageMaker Pipeline: Response received:") 75 | print(upsert_response) 76 | 77 | execution = pipeline.start() 78 | print(f"\n###### Execution started with PipelineExecutionArn: {execution.arn}") 79 | 80 | print("Waiting for the execution to finish...") 81 | execution.wait() 82 | print("\n#####Execution completed. Execution step details:") 83 | 84 | print(execution.list_steps()) 85 | # Todo print the status? 86 | except Exception as e: # pylint: disable=W0703 87 | print(f"Exception: {e}") 88 | sys.exit(1) 89 | 90 | 91 | if __name__ == "__main__": 92 | main() 93 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. 
You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /src/edge/templates/main.html: -------------------------------------------------------------------------------- 1 | 3 | {% extends 'base.html' %} {% block header %} 4 | 5 | {% endblock %} {% block content %} 6 |
7 |
8 |
Edge Agent
9 |
10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | {% for m in loaded_models %} 20 | 21 | 22 | 23 | 24 | 25 | {% endfor %} 26 | 27 |
Model NameModel VersionModel Identifier
{{ m.name }}{{ m.version }}{{ m.identifier }}
28 |
29 |
30 | 31 |
32 |
Camera Stream
33 |
34 |
Filename: {{ image_file }}
35 |
36 | ... 43 |
44 |
45 |
46 | 47 |
48 |
Model Predictions
49 |
50 |
51 |
Image Classification
52 | {% if (y_clf_class == 'normal') %} 53 |
Latency: {{ latency_clf }} ms
54 | 66 | {% elif (y_clf_class == 'anomalous') %} 67 |
Latency: {{ latency_clf }} ms
68 | 80 | {% else %} 81 |

No image classification result available

82 | {% endif %} 83 |
84 | 85 |
86 |
Semantic Segmentation
87 | {% if y_segm_img %} 88 |
Latency: {{ latency_segm }} ms
89 |
90 | 91 |
92 | {% else %} 93 |

No segmentation mask available

94 | {% endif %} 95 |
96 |
97 |
98 |
99 | {% endblock %} 100 | -------------------------------------------------------------------------------- /src/edge/install.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | import boto3 4 | import os 5 | import tarfile 6 | import stat 7 | import io 8 | import logging 9 | import argparse 10 | import pathlib 11 | 12 | logger = logging.getLogger(__name__) 13 | logging.basicConfig(level=logging.INFO) 14 | 15 | s3_client = boto3.client('s3') 16 | 17 | # Default bucket for downloading the SM Edge Agent. Please note that your device needs access to this bucket through IAM 18 | agent_config_package_prefix = 'edge-device-configuration/agent/config.tgz' 19 | agent_version = '1.20210820.e20fa3a' 20 | agent_pkg_bucket = 'sagemaker-edge-release-store-us-west-2-linux-x64' 21 | 22 | def replace_pathnames_in_config(configfile): 23 | """Replaces the pathnames in the agent config to use absolute paths""" 24 | # Read in the file 25 | with open(configfile, 'r') as file : 26 | filedata = file.read() 27 | 28 | # Replace the target string 29 | basepath = str(pathlib.Path().resolve()) 30 | filedata = filedata.replace('$WORKDIR', basepath) 31 | 32 | # Write the file out again 33 | with open(configfile, 'w') as file: 34 | file.write(filedata) 35 | 36 | def download_config(bucket_name): 37 | # Check if agent is installed and configured already 38 | if not os.path.isdir('agent'): 39 | logger.info('No SM Edge Agent directory found. Proceeding with download of configuration package...') 40 | 41 | # Get the configuration package with certificates and config files 42 | with io.BytesIO() as file: 43 | s3_client.download_fileobj(bucket_name, agent_config_package_prefix, file) 44 | file.seek(0) 45 | # Extract the files 46 | tar = tarfile.open(fileobj=file) 47 | tar.extractall('.') 48 | tar.close() 49 | 50 | # Replace the variables in the config file to make paths absolute 51 | logger.info('Replacing path names in Edge Agent configuration file...') 52 | replace_pathnames_in_config('./agent/conf/config_edge_device.json') 53 | 54 | # Download and install SageMaker Edge Manager 55 | agent_pkg_key = 'Releases/%s/%s.tgz' % (agent_version, agent_version) 56 | # get the agent package 57 | logger.info('Downloading and installing SageMaker Edge Agent binaries version \"%s\"...' 
% agent_version) 58 | 59 | with io.BytesIO() as file: 60 | s3_client.download_fileobj(agent_pkg_bucket, agent_pkg_key, file) 61 | file.seek(0) 62 | # Extract the files 63 | tar = tarfile.open(fileobj=file) 64 | tar.extractall('agent') 65 | tar.close() 66 | # Adjust the permissions 67 | os.chmod('agent/bin/sagemaker_edge_agent_binary', stat.S_IXUSR|stat.S_IWUSR|stat.S_IXGRP|stat.S_IWGRP) 68 | 69 | # Finally, create SM Edge Agent client stubs, using protobuffer compiler 70 | logger.info('Creating protobuf agent stubs...') 71 | os.system('mkdir -p app/') 72 | os.system('python3 -m grpc_tools.protoc --proto_path=agent/docs/api --python_out=app/ --grpc_python_out=app/ agent/docs/api/agent.proto') 73 | 74 | if __name__ == '__main__': 75 | parser =argparse.ArgumentParser() 76 | parser.add_argument('--project-name', type=str, required=True) 77 | parser.add_argument('--account-id', type=str, required=True) 78 | args, _ = parser.parse_known_args() 79 | 80 | logger.info('Preparing device...') 81 | 82 | # Infer bucket name from project name and AWS Account ID as created in the CloudFormation template 83 | bucket_name = 'sm-edge-workshop-%s-%s' % (args.project_name, args.account_id) 84 | 85 | # Run the installation script 86 | download_config(bucket_name) 87 | 88 | logger.info('Done!') 89 | 90 | -------------------------------------------------------------------------------- /src/cloud/pipelines/image_classification/evaluation.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | import sys 4 | import os 5 | import subprocess 6 | 7 | # Install packages previous to executing the rest of the script. You can also build your own custom container 8 | # with your individal dependecies if needed 9 | subprocess.check_call([sys.executable, "-m", "pip", "install", "mxnet", "opencv-python"]) 10 | os.system("apt-get update") 11 | os.system("apt-get install ffmpeg libsm6 libxext6 -y") 12 | 13 | import argparse 14 | import json 15 | import warnings 16 | import logging 17 | import pandas as pd 18 | import numpy as np 19 | from glob import glob 20 | from datetime import datetime 21 | import tarfile 22 | from PIL import Image 23 | from glob import glob 24 | import re 25 | 26 | import mxnet as mx 27 | import mxnet.ndarray as nd 28 | from mxnet import nd, gluon 29 | from mxnet.gluon.data.vision import transforms 30 | from sklearn.metrics import accuracy_score, classification_report, roc_auc_score 31 | 32 | # Constants 33 | 34 | # The images size used 35 | 36 | CLASS_LABELS = ['good', 'bad'] 37 | 38 | logger = logging.getLogger() 39 | logger.setLevel(logging.INFO) 40 | logger.addHandler(logging.StreamHandler()) 41 | 42 | ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu() 43 | 44 | if __name__=='__main__': 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument('--image-width', type=int, default=224) 47 | parser.add_argument('--image-height', type=int, default=224) 48 | args, _ = parser.parse_known_args() 49 | 50 | logger.info('Received arguments {}'.format(args)) 51 | 52 | # Define the paths 53 | test_data_base_path = '/opt/ml/processing/test' 54 | model_data_base_path = '/opt/ml/processing/model' 55 | report_output_base_path = '/opt/ml/processing/report' 56 | 57 | IMAGE_WIDTH = int(args.image_width) 58 | IMAGE_HEIGHT = int(args.image_height) 59 | 60 | # Unzipping the model 61 | model_filename = 'model.tar.gz' 62 | model_path = os.path.join(model_data_base_path, 
model_filename) 63 | model_path_extracted = './model/' 64 | 65 | with tarfile.open(model_path) as tar: 66 | tar.extractall(path=model_path_extracted) 67 | 68 | # Get the files needed for loading, parse some strings 69 | symbol_file = glob(os.path.join(model_path_extracted, '*symbol.json'))[0] 70 | params_file = glob(os.path.join(model_path_extracted, '*.params'))[0] 71 | 72 | logger.info('Symbol file: %s' % symbol_file) 73 | logger.info('Params file: %s' % params_file) 74 | 75 | symbol_filename = os.path.basename(symbol_file) 76 | params_filename = os.path.basename(params_file) 77 | 78 | # Extract name and epoch needed for loading 79 | model_name = re.search(r".+(?=-symbol\.json)", symbol_filename).group(0) 80 | epoch = int(re.search(r"[0-9]+(?=\.params)", params_filename).group(0)) 81 | 82 | # Loading model 83 | logger.info('Loading model from artifacts...') 84 | sym, arg_params, aux_params = mx.model.load_checkpoint(os.path.join(model_path_extracted, model_name), epoch) 85 | model = mx.mod.Module(symbol=sym, context=mx.cpu(), label_names=['data']) 86 | model.bind(for_training=False, data_shapes=[('data', (1,3,IMAGE_WIDTH,IMAGE_HEIGHT))], 87 | label_shapes=model._label_shapes) 88 | model.set_params(arg_params, aux_params, allow_missing=True) 89 | 90 | # Load test data into record iterator (batch size 1) 91 | test_data = mx.io.ImageRecordIter( 92 | path_imgrec = os.path.join(test_data_base_path, 'test.rec'), 93 | data_shape = (3, IMAGE_WIDTH, IMAGE_HEIGHT), 94 | batch_size = 1, 95 | shuffle = True 96 | ) 97 | 98 | # Lists for the predicted and true labels 99 | y_true = [] 100 | y_pred = [] 101 | 102 | # For each batch (size=1) predict the class 103 | # TODO: make batch prediction work 104 | for batch in test_data: 105 | res = model.predict(eval_data=batch.data[0]) 106 | pred_class = int(np.argmax(res[0]).asnumpy()[0]) 107 | y_pred.append(pred_class) 108 | y_true.append(int(batch.label[0].asnumpy())) 109 | 110 | clf_report = classification_report(y_true, y_pred, target_names=CLASS_LABELS, output_dict=True) 111 | accuracy = accuracy_score(y_true, y_pred) 112 | 113 | # Save the preprocessing report to make information available to downstream steps 114 | evaluation_report = { 115 | 'multiclass_classification_metrics': { 116 | 'accuracy': { 117 | 'value': accuracy, 118 | 'standard_deviation': 'NaN' 119 | }, 120 | 'weighted_recall': { 121 | 'value': clf_report['weighted avg']['recall'], 122 | 'standard_deviation': 'NaN' 123 | }, 124 | 'weighted_precision': { 125 | 'value': clf_report['weighted avg']['precision'], 126 | 'standard_deviation': 'NaN' 127 | }, 128 | 'weighted_f1': { 129 | 'value': clf_report['weighted avg']['f1-score'], 130 | 'standard_deviation': 'NaN' 131 | } 132 | }, 133 | 'classification_report': clf_report 134 | } 135 | print('Evaluation report:', evaluation_report) 136 | report_output_path = os.path.join(report_output_base_path, 'evaluation_report.json') 137 | with open(report_output_path, "w") as f: 138 | f.write(json.dumps(evaluation_report)) 139 | -------------------------------------------------------------------------------- /src/edge/app/edgeagentclient.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: MIT-0 3 | # From: https://github.com/aws-samples/amazon-sagemaker-edge-manager-demo/blob/main/04_EdgeApplication/turbine/edgeagentclient.py 4 | 5 | import grpc 6 | import logging 7 | import app.agent_pb2 as agent 8 | import app.agent_pb2_grpc as agent_grpc 9 | import struct 10 | import numpy as np 11 | import uuid 12 | 13 | class EdgeAgentClient(object): 14 | """ Helper class that uses the Edge Agent stubs to 15 | communicate with the SageMaker Edge Agent through unix socket. 16 | 17 | To generate the stubs you need to use protoc. First install/update: 18 | pip3 install -U grpcio-tools grpcio protobuf 19 | then generate the code using the provided agent.proto file 20 | 21 | python3 -m grpc_tools.protoc \ 22 | --proto_path=$PWD/agent/docs/api --python_out=./app --grpc_python_out=./app $PWD/agent/docs/api/agent.proto 23 | 24 | """ 25 | def __init__(self, channel_path): 26 | # connect to the agent and list the models 27 | self.channel = grpc.insecure_channel('unix://%s' % channel_path ) 28 | self.agent = agent_grpc.AgentStub(self.channel) 29 | self.model_map = {} 30 | 31 | def __update_models_list__(self): 32 | models_list = self.agent.ListModels(agent.ListModelsRequest()) 33 | self.model_map = {m.name:{'in': m.input_tensor_metadatas, 'out': m.output_tensor_metadatas} for m in models_list.models} 34 | return self.model_map 35 | 36 | def capture_data(self, model_name, input_data, output_data): 37 | """The CaptureData request to the edge agent""" 38 | try: 39 | logging.info('Capturing data for model %s' % model_name) 40 | req = agent.CaptureDataRequest() 41 | req.model_name = model_name 42 | req.capture_id = str(uuid.uuid4()) 43 | req.input_tensors.append( self.create_tensor(input_data, 'input')) 44 | req.output_tensors.append( self.create_tensor(output_data, 'output')) 45 | resp = self.agent.CaptureData(req) 46 | except Exception as e: 47 | logging.error('Error in capture_data: %s' % e) 48 | 49 | def create_tensor(self, x, tensor_name): 50 | """Creates a Edge agent tensor from a numpy float32 array""" 51 | if (x.dtype != np.float32): 52 | raise Exception( "It only supports numpy float32 arrays for this tensor but type for tensor %s was %s" % (tensor_name, x.dtype)) 53 | tensor = agent.Tensor() 54 | tensor.tensor_metadata.name = tensor_name.encode() 55 | tensor.tensor_metadata.data_type = agent.FLOAT32 56 | for s in x.shape: tensor.tensor_metadata.shape.append(s) 57 | tensor.byte_data = x.tobytes() 58 | return tensor 59 | 60 | def predict(self, model_name, x, shm=False): 61 | """ 62 | Invokes the model and get the predictions 63 | """ 64 | try: 65 | if self.model_map.get(model_name) is None: 66 | raise Exception('Model %s not loaded' % model_name) 67 | # Create a request 68 | req = agent.PredictRequest() 69 | req.name = model_name 70 | # Then load the data into a temp Tensor 71 | tensor = agent.Tensor() 72 | meta = self.model_map[model_name]['in'][0] 73 | tensor.tensor_metadata.name = meta.name 74 | tensor.tensor_metadata.data_type = meta.data_type 75 | for s in meta.shape: tensor.tensor_metadata.shape.append(s) 76 | 77 | if shm: 78 | tensor.shared_memory_handle.offset = 0 79 | tensor.shared_memory_handle.segment_id = x 80 | else: 81 | tensor.byte_data = x.astype(np.float32).tobytes() 82 | 83 | req.tensors.append(tensor) 84 | 85 | # Invoke the model 86 | resp = self.agent.Predict(req) 87 | 88 | # Parse the output 89 | meta = self.model_map[model_name]['out'][0] 90 | tensor = resp.tensors[0] 91 | data = np.frombuffer(tensor.byte_data, dtype=np.float32) 92 | return 
data.reshape(tensor.tensor_metadata.shape) 93 | except Exception as e: 94 | logging.error('Error in predict: %s' % e) 95 | return None 96 | 97 | def is_model_loaded(self, model_name): 98 | return self.model_map.get(model_name) is not None 99 | 100 | def load_model(self, model_name, model_path): 101 | """ Load a new model into the Edge Agent if not loaded yet""" 102 | try: 103 | if self.is_model_loaded(model_name): 104 | logging.info( "Model %s was already loaded" % model_name ) 105 | return self.model_map 106 | req = agent.LoadModelRequest() 107 | req.url = model_path 108 | req.name = model_name 109 | resp = self.agent.LoadModel(req) 110 | 111 | return self.__update_models_list__() 112 | except Exception as e: 113 | logging.error('Error in load_model: %s' % e) 114 | return None 115 | 116 | def unload_model(self, model_name): 117 | """ UnLoad model from the Edge Agent""" 118 | try: 119 | if not self.is_model_loaded(model_name): 120 | logging.info( "Model %s was not loaded" % model_name ) 121 | return self.model_map 122 | 123 | req = agent.UnLoadModelRequest() 124 | req.name = model_name 125 | resp = self.agent.UnLoadModel(req) 126 | 127 | return self.__update_models_list__() 128 | except Exception as e: 129 | logging.error('Error in unload_model: %s' % e) 130 | return None -------------------------------------------------------------------------------- /src/cloud/data_preparation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Defect Detection at the edge using Amazon SageMaker - Data preparation and preprocessing\n", 7 | "In this notebook, we will download the dataset and preprocess it accordingly to be used with the provided training pipelines." 
8 | ], 9 | "metadata": {} 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "source": [ 15 | "import boto3\n", 16 | "import time\n", 17 | "import uuid\n", 18 | "import json\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "from PIL import Image\n", 22 | "import glob, os\n", 23 | "from shutil import copyfile\n", 24 | "import sagemaker\n", 25 | "\n", 26 | "sts_client = boto3.client('sts')\n", 27 | "\n", 28 | "# Get the account id\n", 29 | "account_id = sts_client.get_caller_identity()[\"Account\"]\n", 30 | "\n", 31 | "# Project Name as defined in your CloudFormation template\n", 32 | "PROJECT_NAME = ''\n", 33 | "\n", 34 | "region = boto3.Session().region_name\n", 35 | "role = sagemaker.get_execution_role()\n", 36 | "bucket_name = 'sm-edge-workshop-%s-%s' % (PROJECT_NAME, account_id)" 37 | ], 38 | "outputs": [], 39 | "metadata": {} 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "source": [ 45 | "# Download the dataset\n", 46 | "!mkdir ./data\n", 47 | "!wget -P ./data http://go.vicos.si/kolektorsdd2" 48 | ], 49 | "outputs": [], 50 | "metadata": {} 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "source": [ 56 | "# Extract it\n", 57 | "!unzip ./data/kolektorsdd2 -d ./data/kolektor " 58 | ], 59 | "outputs": [], 60 | "metadata": {} 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "source": [ 66 | "# Define some utilities\n", 67 | "\n", 68 | "def img_read(path):\n", 69 | " \"\"\"Read image as numpy array\"\"\"\n", 70 | " with Image.open(path) as i:\n", 71 | " img = np.asarray(i)\n", 72 | " return img\n", 73 | "\n", 74 | "def img_is_anomalous(img):\n", 75 | " \"\"\"Assess whether an image is anomalous by assuming non-black masks are anomalous\"\"\"\n", 76 | " if np.mean(img) > 0:\n", 77 | " return True\n", 78 | " else:\n", 79 | " return False\n", 80 | " \n", 81 | "def sort_img_by_mask(mask_file, dir_normal, dir_anomalous):\n", 82 | " \"\"\"Copy file into specified directories based on mask\"\"\"\n", 83 | " mask_img = img_read(mask_file)\n", 84 | " data_img = mask_file.replace('_GT', '')\n", 85 | " if img_is_anomalous(mask_img):\n", 86 | " copyfile(data_img, os.path.join(dir_anomalous, os.path.basename(data_img)))\n", 87 | " else:\n", 88 | " copyfile(data_img, os.path.join(dir_normal, os.path.basename(data_img)))\n", 89 | " return" 90 | ], 91 | "outputs": [], 92 | "metadata": {} 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "source": [ 98 | "# Define the base directory where the files are located and get a list of all the maks files\n", 99 | "directory = './data/kolektor/train/'\n", 100 | "mask_files = [f for f in glob.glob(os.path.join(directory, '*_GT.png'))]" 101 | ], 102 | "outputs": [], 103 | "metadata": {} 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "source": [ 109 | "# Create folders for the preprocessed images\n", 110 | "!mkdir ./data/kolektor-preprocessed\n", 111 | "!mkdir ./data/kolektor-preprocessed/img-classification\n", 112 | "!mkdir ./data/kolektor-preprocessed/img-classification/normal\n", 113 | "!mkdir ./data/kolektor-preprocessed/img-classification/anomalous\n", 114 | "\n", 115 | "!mkdir ./data/kolektor-preprocessed/semantic-segmentation\n", 116 | "!mkdir ./data/kolektor-preprocessed/semantic-segmentation/images\n", 117 | "!mkdir ./data/kolektor-preprocessed/semantic-segmentation/masks" 118 | ], 119 | "outputs": [], 120 | "metadata": {} 121 | }, 122 | { 123 | "cell_type": "code", 124 | 
"execution_count": null, 125 | "source": [ 126 | "# Read the files and sort them by mask file. If the mask file is just black, we assume that there is no anomaly and thus categorize it as \"normal\"\n", 127 | "\n", 128 | "dir_normal = './data/kolektor-preprocessed/img-classification/normal'\n", 129 | "dir_anomalous = './data/kolektor-preprocessed/img-classification/anomalous'\n", 130 | "\n", 131 | "for mask_file in mask_files:\n", 132 | " sort_img_by_mask(mask_file, dir_normal, dir_anomalous)" 133 | ], 134 | "outputs": [], 135 | "metadata": {} 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "source": [ 141 | "# Sort the files into different folders for their masks and base images\n", 142 | "\n", 143 | "all_files = [f for f in glob.glob(os.path.join(directory, '*.png'))]\n", 144 | "dir_images = './data/kolektor-preprocessed/semantic-segmentation/images'\n", 145 | "dir_masks = './data/kolektor-preprocessed/semantic-segmentation/masks'\n", 146 | "\n", 147 | "for img_path in all_files:\n", 148 | " if '_GT' in img_path:\n", 149 | " # image is mask, sort into mask subdirectory\n", 150 | " copyfile(img_path, os.path.join(dir_masks, os.path.basename(img_path).replace('_GT', '')))\n", 151 | " else:\n", 152 | " copyfile(img_path, os.path.join(dir_images, os.path.basename(img_path)))" 153 | ], 154 | "outputs": [], 155 | "metadata": {} 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "source": [ 161 | "# Copy to S3 bucket\n", 162 | "!aws s3 cp --recursive --quiet ./data/kolektor-preprocessed/ s3://$bucket_name/data/" 163 | ], 164 | "outputs": [], 165 | "metadata": {} 166 | } 167 | ], 168 | "metadata": { 169 | "instance_type": "ml.t3.medium", 170 | "kernelspec": { 171 | "display_name": "Python 3 (Data Science)", 172 | "language": "python", 173 | "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:eu-west-1:470317259841:image/datascience-1.0" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.7.10" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 4 190 | } 191 | -------------------------------------------------------------------------------- /src/cloud/pipelines/semantic_segmentation/train_tf.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: MIT-0 3 | import argparse 4 | import numpy as np 5 | import os 6 | from glob import glob 7 | import cv2 8 | import tensorflow as tf 9 | from tensorflow import keras 10 | import pandas as pd 11 | from tensorflow.keras.layers import Conv2D, Activation, BatchNormalization 12 | from tensorflow.keras.layers import UpSampling2D, Input, Concatenate 13 | from tensorflow.keras.models import Model 14 | from tensorflow.keras.applications import MobileNetV2 15 | from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau 16 | from tensorflow.keras.metrics import Recall, Precision 17 | from tensorflow.keras import backend as K 18 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 19 | 20 | IMAGE_WIDTH=224 21 | IMAGE_HEIGHT=224 22 | 23 | def parse_args(): 24 | 25 | parser = argparse.ArgumentParser() 26 | 27 | # hyperparameters sent by the client are passed as command-line arguments to the script 28 | parser.add_argument('--epochs', type=int, default=100) 29 | parser.add_argument('--batch_size', type=int, default=8) 30 | parser.add_argument('--learning_rate', type=float, default=1e-4) 31 | 32 | # data directories 33 | parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN')) 34 | parser.add_argument('--validation', type=str, default=os.environ.get('SM_CHANNEL_VALIDATION')) 35 | 36 | # model directory: we will use the default set by SageMaker, /opt/ml/model 37 | parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR')) 38 | 39 | return parser.parse_known_args() 40 | 41 | def read_image(path): 42 | path = path.decode() 43 | x = cv2.imread(path, cv2.IMREAD_COLOR) 44 | x = cv2.resize(x, (IMAGE_WIDTH, IMAGE_HEIGHT)) 45 | x = x/255.0 46 | return x 47 | 48 | def read_mask(path): 49 | path = path.decode() 50 | x = cv2.imread(path, cv2.IMREAD_GRAYSCALE) 51 | x = cv2.resize(x, (IMAGE_WIDTH, IMAGE_HEIGHT)) 52 | x = x/255.0 53 | x = np.expand_dims(x, axis=-1) 54 | return x 55 | 56 | def tf_parse(x, y): 57 | def _parse(x, y): 58 | x = read_image(x) 59 | y = read_mask(y) 60 | return x, y 61 | 62 | x, y = tf.numpy_function(_parse, [x, y], [tf.float64, tf.float64]) 63 | x.set_shape([IMAGE_WIDTH, IMAGE_HEIGHT, 3]) 64 | y.set_shape([IMAGE_WIDTH, IMAGE_HEIGHT, 1]) 65 | return x, y 66 | 67 | def tf_dataset(x, y, batch=8): 68 | dataset = tf.data.Dataset.from_tensor_slices((x, y)) 69 | dataset = dataset.map(tf_parse) 70 | dataset = dataset.batch(batch) 71 | dataset = dataset.repeat() 72 | return dataset 73 | 74 | 75 | def model(): 76 | inputs = Input(shape=(IMAGE_WIDTH, IMAGE_HEIGHT, 3), name="input_image") 77 | 78 | encoder = MobileNetV2(input_tensor=inputs, weights="imagenet", include_top=False, alpha=0.35) 79 | skip_connection_names = ["input_image", "block_1_expand_relu", "block_3_expand_relu", "block_6_expand_relu"] 80 | encoder_output = encoder.get_layer("block_13_expand_relu").output 81 | 82 | f = [16, 32, 48, 64] 83 | x = encoder_output 84 | for i in range(1, len(skip_connection_names)+1, 1): 85 | x_skip = encoder.get_layer(skip_connection_names[-i]).output 86 | x = UpSampling2D((2, 2))(x) 87 | x = Concatenate()([x, x_skip]) 88 | 89 | x = Conv2D(f[-i], (3, 3), padding="same")(x) 90 | x = BatchNormalization()(x) 91 | x = Activation("relu")(x) 92 | 93 | x = Conv2D(f[-i], (3, 3), padding="same")(x) 94 | x = BatchNormalization()(x) 95 | x = Activation("relu")(x) 96 | 97 | x = Conv2D(1, (1, 1), padding="same")(x) 98 | x = Activation("sigmoid")(x) 99 | 100 | model = Model(inputs, x) 101 | return model 102 | 103 | 104 | def 
dice_coef(y_true, y_pred): 105 | smooth = 1e-15 106 | y_true = tf.keras.layers.Flatten()(y_true) 107 | y_pred = tf.keras.layers.Flatten()(y_pred) 108 | intersection = tf.reduce_sum(y_true * y_pred) 109 | return (2. * intersection + smooth) / (tf.reduce_sum(y_true) + tf.reduce_sum(y_pred) + smooth) 110 | 111 | def dice_loss(y_true, y_pred): 112 | return 1.0 - dice_coef(y_true, y_pred) 113 | 114 | def get_train_data(train_files_path,validation_files_path): 115 | 116 | train_x = sorted(glob(os.path.join(train_files_path, "images/*"))) 117 | train_y = sorted(glob(os.path.join(train_files_path, "masks/*"))) 118 | 119 | valid_x = sorted(glob(os.path.join(validation_files_path, "images/*"))) 120 | valid_y = sorted(glob(os.path.join(validation_files_path, "masks/*"))) 121 | 122 | 123 | 124 | return train_x,train_y,valid_x,valid_y 125 | 126 | 127 | if __name__ == "__main__": 128 | 129 | args, _ = parse_args() 130 | EPOCHS = args.epochs 131 | BATCH = args.batch_size 132 | LR = args.learning_rate 133 | 134 | train_x,train_y,valid_x,valid_y = get_train_data(args.train,args.validation) 135 | train_dataset = tf_dataset(train_x, train_y, batch=BATCH) 136 | valid_dataset = tf_dataset(valid_x, valid_y, batch=BATCH) 137 | print(train_dataset) 138 | 139 | 140 | device = '/cpu:0' 141 | print(device) 142 | batch_size = args.batch_size 143 | epochs = args.epochs 144 | learning_rate = args.learning_rate 145 | print('batch_size = {}, epochs = {}, learning rate = {}'.format(batch_size, epochs, learning_rate)) 146 | 147 | with tf.device(device): 148 | 149 | model = model() 150 | opt = tf.keras.optimizers.Nadam(LR) 151 | metrics = [dice_coef, Recall(), Precision()] 152 | model.compile(loss=dice_loss, optimizer=opt, metrics=metrics) 153 | 154 | callbacks = [ 155 | ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=4), 156 | EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=False) 157 | ] 158 | 159 | train_steps = len(train_x)//BATCH 160 | valid_steps = len(valid_x)//BATCH 161 | 162 | if len(train_x) % BATCH != 0: 163 | train_steps += 1 164 | if len(valid_x) % BATCH != 0: 165 | valid_steps += 1 166 | model.fit( 167 | train_dataset, 168 | validation_data=valid_dataset, 169 | epochs=EPOCHS, 170 | steps_per_epoch=train_steps, 171 | validation_steps=valid_steps, 172 | callbacks=callbacks 173 | ) 174 | # evaluate on train set 175 | scores = model.evaluate(train_dataset,steps=train_steps) 176 | print("\ntrain bce :", scores) 177 | 178 | # evaluate on val set 179 | scores = model.evaluate(valid_dataset,steps=valid_steps) 180 | print("\nval bce :", scores) 181 | 182 | # save model 183 | #model.save(args.model_dir + '/1') 184 | 185 | #Save as .h5, neo supports only .h5 format for keras , set 'include_optimizer=False' to remove operators that do not compile 186 | filepath=args.model_dir + '/unet_mobilenetv2.h5' 187 | tf.keras.models.save_model( 188 | model, filepath, overwrite=True, include_optimizer=False, save_format='h5'#, 189 | #signatures=None, options=None, save_traces=True 190 | ) 191 | -------------------------------------------------------------------------------- /src/cloud/pipelines/semantic_segmentation/pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: MIT-0 3 | import os 4 | import numpy as np 5 | import boto3 6 | import time 7 | import sagemaker 8 | import sagemaker.session 9 | 10 | from sagemaker.workflow.parameters import ParameterInteger, ParameterString 11 | from sagemaker.sklearn.processing import SKLearnProcessor 12 | from sagemaker.processing import ProcessingInput, ProcessingOutput 13 | from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CacheConfig 14 | from sagemaker.workflow.properties import PropertyFile 15 | from sagemaker.inputs import TrainingInput 16 | from sagemaker.workflow.step_collections import RegisterModel 17 | from sagemaker.workflow.pipeline import Pipeline 18 | 19 | BASE_DIR = os.path.dirname(os.path.realpath(__file__)) 20 | 21 | def get_session(region, default_bucket): 22 | """Gets the sagemaker session based on the region. 23 | 24 | Args: 25 | region: the aws region to start the session 26 | default_bucket: the bucket to use for storing the artifacts 27 | 28 | Returns: 29 | `sagemaker.session.Session` instance 30 | """ 31 | 32 | boto_session = boto3.Session(region_name=region) 33 | 34 | sagemaker_client = boto_session.client("sagemaker") 35 | runtime_client = boto_session.client("sagemaker-runtime") 36 | return sagemaker.session.Session( 37 | boto_session=boto_session, 38 | sagemaker_client=sagemaker_client, 39 | sagemaker_runtime_client=runtime_client, 40 | default_bucket=default_bucket, 41 | ) 42 | 43 | def get_pipeline( 44 | region, 45 | role=None, 46 | default_bucket=None, 47 | pipeline_name="defect-detection-semantic-segmentation-pipeline", 48 | base_job_prefix="defect-detection-semantic-segmentation", 49 | ): 50 | """Gets a SageMaker ML Pipeline instance working on DefectDetection data. 51 | 52 | Args: 53 | region: AWS region to create and run the pipeline. 54 | role: IAM role to create and run steps and pipeline.
55 | default_bucket: the bucket to use for storing the artifacts 56 | 57 | Returns: 58 | an instance of a pipeline 59 | """ 60 | sagemaker_session = get_session(region, default_bucket) 61 | if role is None: 62 | role = sagemaker.session.get_execution_role(sagemaker_session) 63 | 64 | ## By enabling cache, if you run this pipeline again, without changing the input 65 | ## parameters it will skip the training part and reuse the previous trained model 66 | cache_config = CacheConfig(enable_caching=True, expire_after="30d") 67 | ts = time.strftime('%Y-%m-%d-%H-%M-%S') 68 | 69 | # Data prep 70 | processing_instance_type = ParameterString( # instance type for data preparation 71 | name="ProcessingInstanceType", 72 | default_value="ml.m5.xlarge" 73 | ) 74 | processing_instance_count = ParameterInteger( # number of instances used for data preparation 75 | name="ProcessingInstanceCount", 76 | default_value=1 77 | ) 78 | 79 | # Training 80 | training_instance_type = ParameterString( # instance type for training the model 81 | name="TrainingInstanceType", 82 | default_value="ml.c5.xlarge" 83 | ) 84 | training_instance_count = ParameterInteger( # number of instances used to train your model 85 | name="TrainingInstanceCount", 86 | default_value=1 87 | ) 88 | training_epochs = ParameterString( 89 | name="TrainingEpochs", 90 | default_value="100" 91 | ) 92 | 93 | # Dataset input data: S3 path 94 | input_data = ParameterString( 95 | name="InputData", 96 | default_value="", 97 | ) 98 | 99 | # Model Approval State 100 | model_approval_status = ParameterString( 101 | name="ModelApprovalStatus", 102 | default_value="PendingManualApproval" 103 | ) 104 | 105 | # Model package group name for registering in model registry 106 | model_package_group_name = ParameterString( 107 | name="ModelPackageGroupName", 108 | default_value="defect-detection-semantic-segmentation-model-group" 109 | ) 110 | 111 | # The preprocessor 112 | preprocessor = SKLearnProcessor( 113 | framework_version="0.23-1", 114 | role=role, 115 | instance_type=processing_instance_type, 116 | instance_count=processing_instance_count, 117 | max_runtime_in_seconds=7200, 118 | ) 119 | 120 | # A preprocessing report to store some information from the preprocessing step for next steps 121 | preprocessing_report = PropertyFile( 122 | name='PreprocessingReport', 123 | output_name='preprocessing_report', 124 | path='preprocessing_report.json' 125 | ) 126 | 127 | # Preprocessing Step 128 | step_process = ProcessingStep( 129 | name="DefectDetectionPreprocessing", 130 | code=os.path.join(BASE_DIR, 'preprocessing.py'), ## this is the script defined above 131 | processor=preprocessor, 132 | inputs=[ 133 | ProcessingInput(source=input_data, destination='/opt/ml/processing/input') 134 | ], 135 | outputs=[ 136 | ProcessingOutput(output_name='train_data', source='/opt/ml/processing/train'), 137 | ProcessingOutput(output_name='test_data', source='/opt/ml/processing/test'), 138 | ProcessingOutput(output_name='val_data', source='/opt/ml/processing/val'), 139 | ProcessingOutput(output_name='preprocessing_report', source='/opt/ml/processing/report') 140 | ], 141 | job_arguments=['--split', '0.1'], 142 | property_files=[preprocessing_report] 143 | ) 144 | 145 | from sagemaker.tensorflow import TensorFlow 146 | model_dir = '/opt/ml/model' 147 | hyperparameters = {'epochs': training_epochs, 'batch_size': 8, 'learning_rate': 0.0001} 148 | estimator = TensorFlow(source_dir=BASE_DIR, 149 | entry_point='train_tf.py', 150 | model_dir=model_dir, 151 | 
instance_type=training_instance_type, 152 | #instance_type='local', 153 | instance_count=training_instance_count, 154 | hyperparameters=hyperparameters, 155 | role=role, 156 | output_path='s3://{}/{}/{}/{}'.format(default_bucket, 'models', base_job_prefix, 'training-output'), 157 | framework_version='2.2.0', 158 | py_version='py37', 159 | script_mode=True 160 | ) 161 | 162 | step_train = TrainingStep( 163 | name="DefectDetectionSemanticSegmentationTrain", 164 | estimator=estimator, 165 | inputs={ 166 | "train": TrainingInput( 167 | s3_data=step_process.properties.ProcessingOutputConfig.Outputs["train_data"].S3Output.S3Uri, 168 | content_type='image/png', 169 | s3_data_type='S3Prefix' 170 | ), 171 | "validation": TrainingInput( 172 | s3_data=step_process.properties.ProcessingOutputConfig.Outputs["val_data"].S3Output.S3Uri, 173 | content_type='image/png', 174 | s3_data_type='S3Prefix' 175 | ) 176 | }, 177 | cache_config=cache_config 178 | ) 179 | 180 | # Register model step that will be conditionally executed 181 | step_register = RegisterModel( 182 | name="DefectDetectionSemanticSegmentationRegister", 183 | estimator=estimator, 184 | model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, 185 | content_types=["image/png"], 186 | response_types=["application/json"], 187 | inference_instances=["ml.c5.2xlarge", "ml.p3.2xlarge"], 188 | transform_instances=["ml.c5.xlarge"], 189 | model_package_group_name=model_package_group_name, 190 | approval_status=model_approval_status 191 | ) 192 | 193 | pipeline = Pipeline( 194 | name=pipeline_name, 195 | parameters=[ 196 | processing_instance_type, 197 | processing_instance_count, 198 | training_instance_type, 199 | training_instance_count, 200 | training_epochs, 201 | input_data, 202 | model_approval_status, 203 | model_package_group_name 204 | ], 205 | steps=[step_process, step_train, step_register], 206 | sagemaker_session=sagemaker_session, 207 | ) 208 | return pipeline 209 | -------------------------------------------------------------------------------- /src/cloud/pipelines/image_classification/preprocessing.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | import sys 4 | import os 5 | import subprocess 6 | 7 | # Install packages previous to executing the rest of the script. 
You can also build your own custom container 8 | # with your individal dependecies if needed 9 | subprocess.check_call([sys.executable, "-m", "pip", "install", "Augmentor", "wget", "mxnet", "opencv-python"]) 10 | os.system("apt-get update -y") 11 | os.system("apt-get install ffmpeg libsm6 libxext6 -y") 12 | 13 | import argparse 14 | import json 15 | import warnings 16 | import pandas as pd 17 | import numpy as np 18 | from glob import glob 19 | from datetime import datetime 20 | import shutil 21 | import wget 22 | from PIL import Image 23 | import Augmentor 24 | 25 | from sklearn.model_selection import train_test_split 26 | 27 | 28 | # Constants 29 | 30 | # the "folders" in the S3 bucket which define which images are good or bad 31 | PREFIX_NAME_NORMAL = 'normal' 32 | PREFIX_NAME_ANOMALOUS = 'anomalous' 33 | 34 | 35 | # Download im2rec.py tool for RecordIO conversion 36 | filename_im2rec_tool = wget.download("https://raw.githubusercontent.com/apache/incubator-mxnet/master/tools/im2rec.py") 37 | 38 | def augment_data(path, sample_count): 39 | """Augments the image dataset in the given path by adding rotation, zoom, 40 | brightness, contrast to the dataset""" 41 | p = Augmentor.Pipeline(path, output_directory=path) 42 | 43 | # Define augmentation operations 44 | #p.rotate(probability=0.4, max_left_rotation=8, max_right_rotation=8) 45 | #p.zoom(probability=0.3, min_factor=1.1, max_factor=1.3) 46 | p.random_brightness(probability=0.3, min_factor=0.4, max_factor=0.9) 47 | p.random_contrast(probability=0.2, min_factor=0.9, max_factor=1.1) 48 | 49 | p.sample(sample_count) 50 | 51 | 52 | def split_dataset(path, split=0.1): 53 | """Split the images into train-test-validation and move them into separate folder each (named train, test, val)""" 54 | 55 | label_map = { 'good': 0, 'bad': 1 } 56 | bad = sorted(glob(os.path.join(path, "%s/*" % PREFIX_NAME_ANOMALOUS))) 57 | good = sorted(glob(os.path.join(path, "%s/*" % PREFIX_NAME_NORMAL))) 58 | 59 | images = bad + good 60 | labels = ([label_map['bad']] * len(bad)) + ([label_map['good']] * len(good)) 61 | 62 | total_size = len(images) 63 | valid_size = int(split * total_size) 64 | test_size = int(split * total_size) 65 | print('Total number of samples (normal and anomalous):', total_size) 66 | 67 | train_x, valid_x = train_test_split(images, test_size=valid_size, random_state=42) 68 | train_y, valid_y = train_test_split(labels, test_size=valid_size, random_state=42) 69 | 70 | train_x, test_x = train_test_split(train_x, test_size=test_size, random_state=42) 71 | train_y, test_y = train_test_split(train_y, test_size=test_size, random_state=42) 72 | 73 | return (train_x, train_y), (valid_x, valid_y), (test_x, test_y) 74 | 75 | def resize_images(path, width, height): 76 | """Resize all images in a given path (in-place). 
Please note that this method 77 | overwrites existing images in the path""" 78 | files = glob(os.path.join(path, '*.png')) + glob(os.path.join(path, '*.jpg')) 79 | for file in files: 80 | im = Image.open(file) 81 | im_resized = im.resize((width, height), Image.ANTIALIAS) 82 | im_resized.save(file) 83 | 84 | 85 | def get_square_image(img): 86 | """Returns a squared image by adding black padding""" 87 | padding_color = (0, 0, 0) 88 | width, height = img.size 89 | if width == height: 90 | return img 91 | elif width > height: 92 | result = Image.new(img.mode, (width, width), padding_color) 93 | result.paste(img, (0, (width - height) // 2)) 94 | return result 95 | else: 96 | result = Image.new(img.mode, (height, height), padding_color) 97 | result.paste(img, ((height - width) // 2, 0)) 98 | return result 99 | 100 | def square_images(path): 101 | """Squares all images in a given path (in-place). Please note that this 102 | method overwrites existing images in the path.""" 103 | files = glob(os.path.join(path, '*.png')) + glob(os.path.join(path, '*.jpg')) 104 | for file in files: 105 | im = Image.open(file) 106 | im_squared = get_square_image(im) 107 | im_squared.save(file) 108 | 109 | 110 | if __name__=='__main__': 111 | parser = argparse.ArgumentParser() 112 | parser.add_argument('--augment-count-normal', type=int, default=0) 113 | parser.add_argument('--augment-count-anomalous', type=int, default=0) 114 | parser.add_argument('--image-width', type=int, default=224) 115 | parser.add_argument('--image-height', type=int, default=224) 116 | parser.add_argument('--split', type=float, default=0.1) 117 | args, _ = parser.parse_known_args() 118 | 119 | print('Received arguments {}'.format(args)) 120 | 121 | # Define the paths 122 | input_data_base_path = '/opt/ml/processing/input' 123 | train_output_base_path = '/opt/ml/processing/train' 124 | test_output_base_path = '/opt/ml/processing/test' 125 | val_output_base_path = '/opt/ml/processing/val' 126 | report_output_base_path = '/opt/ml/processing/report' 127 | temp_data_base_path = 'opt/ml/processing/tmp' 128 | 129 | input_path_normal = os.path.join(input_data_base_path, PREFIX_NAME_NORMAL) 130 | input_path_anomalous = os.path.join(input_data_base_path, PREFIX_NAME_ANOMALOUS) 131 | 132 | # The images size used 133 | IMAGE_WIDTH = int(args.image_width) 134 | IMAGE_HEIGHT = int(args.image_height) 135 | 136 | # Augment images if needed 137 | # TODO: Only augment training images, not entire dataset! 138 | print('Augmenting images...') 139 | augment_data(input_path_normal, int(args.augment_count_normal)) 140 | augment_data(input_path_anomalous, int(args.augment_count_anomalous)) 141 | 142 | # Square all the images to ensure that only squared images exist in the training datset by adding a black padding around the image 143 | # IMPORTANT: Make sure you do the same when running inference 144 | print('Squaring all images that are not squared already...') 145 | square_images(input_path_normal) 146 | square_images(input_path_anomalous) 147 | 148 | # Resize the images in-place in the container image 149 | print('Resizing images...') 150 | resize_images(input_path_normal, IMAGE_WIDTH, IMAGE_HEIGHT) 151 | resize_images(input_path_anomalous, IMAGE_WIDTH, IMAGE_HEIGHT) 152 | 153 | # Create train test validation split 154 | # FIXME: only augment train dataset, not the test dataset! 
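    # The .lst files written below use the tab-separated layout expected by MXNet's im2rec.py tool
    # ("<index>\t<label>\t<image path>" per line); im2rec.py then packs the listed images into
    # RecordIO (.rec) files, which is the format the training step consumes (content type
    # application/x-recordio, Pipe input mode).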
155 | (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = split_dataset(input_data_base_path, split=float(args.split)) 156 | 157 | 158 | # Create list files for RecordIO transformation 159 | base_dir_recordio = './' 160 | 161 | with open(base_dir_recordio+'train.lst', 'w+') as f: 162 | for indx, s in enumerate(train_x): 163 | f.write(f'{indx}\t{train_y[indx]}\t{s}\n') 164 | 165 | with open(base_dir_recordio+'val.lst', 'w+') as f: 166 | for indx, s in enumerate(valid_x): 167 | f.write(f'{indx}\t{valid_y[indx]}\t{s}\n') 168 | 169 | with open(base_dir_recordio+'test.lst', 'w+') as f: 170 | for indx, s in enumerate(test_x): 171 | f.write(f'{indx}\t{test_y[indx]}\t{s}\n') 172 | 173 | # Run im2rec.py file to convert to RecordIO 174 | print('Running im2rec.py tool for recordio conversion') 175 | os.system('python3 ./im2rec.py train.lst ./') 176 | os.system('python3 ./im2rec.py val.lst ./') 177 | os.system('python3 ./im2rec.py test.lst ./') 178 | 179 | # Copy to the output paths 180 | shutil.copy('train.rec', os.path.join(train_output_base_path, 'train.rec')) 181 | shutil.copy('val.rec', os.path.join(val_output_base_path, 'val.rec')) 182 | shutil.copy('test.rec', os.path.join(test_output_base_path, 'test.rec')) 183 | 184 | # Save the preprocessing report to make information available to downstream steps 185 | preprocessing_report = { 186 | 'preprocessing': { 187 | 'dataset': { 188 | 'num_training_samples': len(train_x), 189 | 'num_test_samples': len(test_x), 190 | 'num_val_samples': len(valid_x) 191 | } 192 | } 193 | } 194 | print('Preprocessing report:', preprocessing_report) 195 | report_output_path = os.path.join(report_output_base_path, 'preprocessing_report.json') 196 | with open(report_output_path, "w") as f: 197 | f.write(json.dumps(preprocessing_report)) 198 | 199 | -------------------------------------------------------------------------------- /src/edge/app/ota.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | # Source code partially modified from: https://github.com/aws-samples/amazon-sagemaker-edge-manager-demo/blob/main/04_EdgeApplication/turbine/ota.py 4 | 5 | import ssl 6 | import paho.mqtt.client as mqtt 7 | import logging 8 | import json 9 | import os 10 | import io 11 | import time 12 | import requests 13 | import boto3 14 | import tarfile 15 | import glob 16 | import threading 17 | import app 18 | 19 | class OTAModelUpdate(object): 20 | def __init__(self, device_name, iot_params, mqtt_host, mqtt_port, update_callback, model_path, models_supported): 21 | ''' 22 | This class is responsible for listening to IoT topics and receiving 23 | a Json document with the metadata of a new model. This module also 24 | downloads the SageMaker Edge Manager deployment package, unpacks it to 25 | a local dir and also controls versioning. 
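        The IoT job document this class expects (see __process_job__ below and the
        create_job call in the cloud notebooks) has the following shape:

            {
                "type": "new_model",
                "model_name": "<name of the model>",
                "model_version": "<model version string>",
                "model_package_bucket": "<S3 bucket holding the packaged model>",
                "model_package_key": "<S3 key of the packaged model archive>"
            }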
26 | ''' 27 | if model_path is None or update_callback is None: 28 | raise Exception("You need to inform a model_path and an update_callback methods") 29 | self.device_name = device_name 30 | self.model_path = model_path 31 | self.update_callback = update_callback 32 | self.iot_params = iot_params 33 | self.models_supported = models_supported 34 | 35 | ## initialize an mqtt client 36 | self.mqttc = mqtt.Client() 37 | self.mqttc.tls_set( 38 | iot_params['sagemaker_edge_provider_aws_ca_cert_file'], 39 | certfile=iot_params['sagemaker_edge_provider_aws_cert_file'], 40 | keyfile=iot_params['sagemaker_edge_provider_aws_cert_pk_file'], 41 | cert_reqs=ssl.CERT_REQUIRED, tls_version=ssl.PROTOCOL_TLSv1_2, ciphers=None 42 | ) 43 | self.mqttc.enable_logger(logger=logging) 44 | self.mqttc.on_message = self.__on_message__ 45 | self.mqttc.on_connect = self.__on_connect__ 46 | self.mqttc.on_disconnect = self.__on_disconnect__ 47 | self.connected = False 48 | 49 | self.processing_lock = threading.Lock() 50 | self.processed_jobs = [] 51 | 52 | # start the mqtt client 53 | self.mqttc.connect(mqtt_host, mqtt_port, 45) 54 | self.mqttc.loop_start() 55 | 56 | def model_update_check(self): 57 | ''' 58 | Check manually if there is a new model available 59 | ''' 60 | if self.connected: 61 | self.mqttc.publish('$aws/things/%s/jobs/get' % self.device_name) 62 | 63 | def __on_message__(self, client, userdata, message): 64 | ''' 65 | This callback is invoked by MQTTC each time a new message is published 66 | to one of the subscribed topics 67 | ''' 68 | logging.debug("New message. Topic: %s; Message: %s;" % (message.topic, message.payload)) 69 | 70 | if message.topic.endswith('notify'): 71 | self.model_update_check() 72 | 73 | elif message.topic.endswith('accepted'): 74 | resp = json.loads(message.payload) 75 | logging.debug(resp) 76 | if resp.get('queuedJobs') is not None: # request to list jobs 77 | # get the description of each queued job 78 | for j in resp['queuedJobs']: 79 | ## get the job description 80 | self.mqttc.publish('$aws/things/%s/jobs/%s/get' % ( self.device_name, j['jobId'] ) ) 81 | break 82 | elif resp.get('inProgressJobs') is not None: # request to list jobs 83 | # get the description of each queued job 84 | for j in resp['inProgressJobs']: 85 | ## get the job description 86 | self.mqttc.publish('$aws/things/%s/jobs/%s/get' % ( self.device_name, j['jobId'] ) ) 87 | break 88 | elif resp.get('execution') is not None: # request to get job description 89 | # check if this is a job description message 90 | job_meta = resp.get('execution') 91 | 92 | # we have the job metadata, let's process it 93 | self.__update_job_status__(job_meta['jobId'], 'IN_PROGRESS', 'Trying to get/load the model') 94 | self.__process_job__(job_meta['jobId'], job_meta['jobDocument']) 95 | else: 96 | logging.debug('Other message: ', resp) 97 | 98 | def __on_connect__(self, client, userdata, flags, rc): 99 | ''' 100 | This callback is invoked just after MQTTC managed to connect 101 | to the MQTT endpoint 102 | ''' 103 | self.connected = True 104 | logging.info("OTA Model Manager Connected to the MQTT endpoint!") 105 | self.mqttc.subscribe('$aws/things/%s/jobs/notify' % self.device_name) 106 | self.mqttc.subscribe('$aws/things/%s/jobs/accepted' % self.device_name) 107 | self.mqttc.subscribe('$aws/things/%s/jobs/rejected' % self.device_name) 108 | time.sleep(1) 109 | self.model_update_check() 110 | 111 | def __on_disconnect__(self, client, userdata, flags): 112 | ''' 113 | This callback is invoked when MQTTC disconnected from the MQTT 
endpoint 114 | ''' 115 | self.connected = False 116 | logging.info("OTA Model Manager Disconnected!") 117 | 118 | def __del__(self): 119 | ''' 120 | Object destructor 121 | ''' 122 | logging.info("OTA Model Manager Deleting this object") 123 | self.mqttc.loop_stop() 124 | self.mqttc.disconnect() 125 | 126 | def __update_job_status__(self, job_id, status, details): 127 | ''' 128 | After receiving a new signal that there is a model to be deployed 129 | Update the IoT Job to inform the user the current status of this 130 | process 131 | ''' 132 | payload = json.dumps({ 133 | "status": status, 134 | "statusDetails": {"info": details }, 135 | "includeJobExecutionState": False, 136 | "includeJobDocument": False, 137 | "stepTimeoutInMinutes": 2, 138 | }) 139 | logging.info("Updating IoT job status: %s" % details) 140 | self.mqttc.publish('$aws/things/%s/jobs/%s/update' % ( self.device_name, job_id), payload) 141 | 142 | 143 | def __process_job__(self, job_id, msg): 144 | ''' 145 | This method is responsible for: 146 | 1. validate the new model version 147 | 2. download the model package 148 | 3. unpack it to a local dir 149 | 4. notify the main application 150 | ''' 151 | self.processing_lock.acquire() 152 | if job_id in self.processed_jobs: 153 | self.processing_lock.release() 154 | return 155 | self.processed_jobs.append(job_id) 156 | try: 157 | if msg.get('type') == 'new_model': 158 | model_version = msg['model_version'] 159 | model_name = msg['model_name'] 160 | 161 | # Check if the application supports the model with the name incoming 162 | if model_name not in self.models_supported: 163 | msg = 'New model %s from incoming deployment is not in list of supported models. Skipping deployment.' % model_name 164 | logging.info(msg) 165 | self.__update_job_status__(job_id, 'FAILED', msg) 166 | self.processing_lock.release() 167 | return 168 | 169 | logging.info("Downloading new model package") 170 | s3_client = app.get_client('s3', self.iot_params) 171 | 172 | package = io.BytesIO(s3_client.get_object( 173 | Bucket=msg['model_package_bucket'], 174 | Key=msg['model_package_key'])['Body'].read() 175 | ) 176 | logging.info("Unpacking model package") 177 | with tarfile.open(fileobj=package) as p: 178 | p.extractall(os.path.join(self.model_path, msg['model_name'], msg['model_version'])) 179 | 180 | self.__update_job_status__(job_id, 'SUCCEEDED', 'Model deployed') 181 | self.update_callback(model_name, model_version) 182 | else: 183 | logging.info("Model '%s' version '%f' is the current one or it is obsolete" % (self.model_metadata['model_name'], self.model_metadata['model_version'])) 184 | except Exception as e: 185 | self.__update_job_status__(job_id, 'FAILED', str(e)) 186 | logging.error(e) 187 | 188 | self.processing_lock.release() -------------------------------------------------------------------------------- /src/cloud/pipelines/semantic_segmentation/preprocessing.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | import sys 4 | import os 5 | import subprocess 6 | 7 | # Install packages previous to executing the rest of the script. 
You can also build your own custom container 8 | # with your individual dependencies if needed 9 | subprocess.check_call([sys.executable, "-m", "pip", "install", "wget", "opencv-python","albumentations","tqdm"]) 10 | os.system("apt-get update") 11 | os.system("apt-get install ffmpeg libsm6 libxext6 -y") 12 | 13 | import argparse 14 | import json 15 | from glob import glob 16 | import shutil 17 | from PIL import Image 18 | from pathlib import Path 19 | 20 | import cv2 21 | from tqdm import tqdm 22 | from albumentations import CenterCrop, RandomRotate90, GridDistortion, HorizontalFlip, VerticalFlip 23 | 24 | from sklearn.model_selection import train_test_split 25 | 26 | 27 | # Constants 28 | 29 | # the "folders" in the S3 bucket for images and their ground truth masks 30 | PREFIX_NAME_IMAGE = 'images' 31 | PREFIX_NAME_MASK = 'masks' 32 | 33 | # The images size used 34 | IMAGE_WIDTH = 224 35 | IMAGE_HEIGHT = 224 36 | 37 | def augment_data(path, augment=True): 38 | save_path = path 39 | images = sorted(glob(os.path.join(path, PREFIX_NAME_IMAGE + "/*"))) 40 | masks = sorted(glob(os.path.join(path, PREFIX_NAME_MASK + "/*"))) 41 | 42 | for x, y in tqdm(zip(images, masks), total=len(images)): 43 | name = x.split("/")[-1].split(".") 44 | 45 | img_name = name[0] 46 | image_extn = name[1] 47 | 48 | name = y.split("/")[-1].split(".") 49 | mask_name = name[0] 50 | mask_extn = name[1] 51 | 52 | # Read image mask 53 | x = cv2.imread(x, cv2.IMREAD_COLOR) 54 | y = cv2.imread(y, cv2.IMREAD_COLOR) 55 | 56 | # Augment dataset 57 | if augment == True: 58 | aug = RandomRotate90(p=1.0) 59 | augmented = aug(image=x, mask=y) 60 | x1 = augmented['image'] 61 | y1 = augmented['mask'] 62 | 63 | aug = RandomRotate90(p=1.0) 64 | augmented = aug(image=x, mask=y) 65 | x2 = augmented['image'] 66 | y2 = augmented['mask'] 67 | 68 | aug = GridDistortion(p=1.0) 69 | augmented = aug(image=x, mask=y) 70 | x3 = augmented['image'] 71 | y3 = augmented['mask'] 72 | 73 | aug = HorizontalFlip(p=1.0) 74 | augmented = aug(image=x, mask=y) 75 | x4 = augmented['image'] 76 | y4 = augmented['mask'] 77 | 78 | aug = VerticalFlip(p=1.0) 79 | augmented = aug(image=x, mask=y) 80 | x5 = augmented['image'] 81 | y5 = augmented['mask'] 82 | 83 | save_images = [x, x1, x2, x3, x4, x5] 84 | save_masks = [y, y1, y2, y3, y4, y5] 85 | 86 | else: 87 | save_images = [x] 88 | save_masks = [y] 89 | 90 | """ Saving the image and mask. """ 91 | idx = 0 92 | Path(save_path + "/" + PREFIX_NAME_IMAGE ).mkdir(parents=True, exist_ok=True) 93 | Path(save_path + "/" + PREFIX_NAME_MASK ).mkdir(parents=True, exist_ok=True) 94 | for i, m in zip(save_images, save_masks): 95 | i = cv2.resize(i, (IMAGE_WIDTH, IMAGE_HEIGHT)) 96 | m = cv2.resize(m, (IMAGE_WIDTH, IMAGE_HEIGHT)) 97 | 98 | if len(images) == 1: 99 | tmp_img_name = f"{img_name}.{image_extn}" 100 | tmp_mask_name = f"{mask_name}.{mask_extn}" 101 | else: 102 | tmp_img_name = f"{img_name}_{idx}.{image_extn}" 103 | tmp_mask_name = f"{mask_name}_{idx}.{mask_extn}" 104 | 105 | image_path = os.path.join(save_path, PREFIX_NAME_IMAGE, tmp_img_name) 106 | mask_path = os.path.join(save_path, PREFIX_NAME_MASK, tmp_mask_name) 107 | 108 | cv2.imwrite(image_path, i) 109 | cv2.imwrite(mask_path, m) 110 | 111 | idx += 1 112 | 113 | 114 | def resize_images(path, width, height): 115 | """Resize all images in a given path (in-place). 
Please note that this method 116 | overwrites existing images in the path""" 117 | files = glob(os.path.join(path, '*.png')) + glob(os.path.join(path, '*.jpg')) 118 | for file in files: 119 | im = Image.open(file) 120 | im_resized = im.resize((width, height), Image.ANTIALIAS) 121 | im_resized.save(file) 122 | 123 | def get_square_image(img, padding_color=(0, 0, 0)): 124 | """Returns a squared image by adding black padding""" 125 | width, height = img.size 126 | if width == height: 127 | return img 128 | elif width > height: 129 | result = Image.new(img.mode, (width, width), padding_color) 130 | result.paste(img, (0, (width - height) // 2)) 131 | return result 132 | else: 133 | result = Image.new(img.mode, (height, height), padding_color) 134 | result.paste(img, ((height - width) // 2, 0)) 135 | return result 136 | 137 | def square_images(path, padding_color=(0,0,0)): 138 | """Squares all images in a given path (in-place). Please note that this 139 | method overwrites existing images in the path.""" 140 | files = glob(os.path.join(path, '*.png')) + glob(os.path.join(path, '*.jpg')) 141 | for file in files: 142 | im = Image.open(file) 143 | im_squared = get_square_image(im, padding_color) 144 | im_squared.save(file) 145 | 146 | def load_data(path, split=0.1): 147 | images = sorted(glob(os.path.join(path, PREFIX_NAME_IMAGE + "/*"))) 148 | masks = sorted(glob(os.path.join(path, PREFIX_NAME_MASK + "/*"))) 149 | 150 | total_size = len(images) 151 | valid_size = int(split * total_size) 152 | test_size = int(split * total_size) 153 | print(total_size) 154 | train_x, valid_x = train_test_split(images, test_size=valid_size, random_state=42) 155 | train_y, valid_y = train_test_split(masks, test_size=valid_size, random_state=42) 156 | 157 | train_x, test_x = train_test_split(train_x, test_size=test_size, random_state=42) 158 | train_y, test_y = train_test_split(train_y, test_size=test_size, random_state=42) 159 | 160 | return (train_x, train_y), (valid_x, valid_y), (test_x, test_y) 161 | 162 | 163 | if __name__=='__main__': 164 | parser = argparse.ArgumentParser() 165 | parser.add_argument('--split', type=float, default=0.1) 166 | args, _ = parser.parse_known_args() 167 | 168 | print('Received arguments {}'.format(args)) 169 | 170 | # Define the paths 171 | input_data_base_path = '/opt/ml/processing/input' 172 | train_output_base_path = '/opt/ml/processing/train' 173 | test_output_base_path = '/opt/ml/processing/test' 174 | val_output_base_path = '/opt/ml/processing/val' 175 | report_output_base_path = '/opt/ml/processing/report' 176 | 177 | #Augment images and save in new directory 178 | augment_data(input_data_base_path) 179 | 180 | print('Squaring images...') 181 | square_images(os.path.join(input_data_base_path, PREFIX_NAME_IMAGE)) 182 | square_images(os.path.join(input_data_base_path, PREFIX_NAME_MASK), padding_color=(0)) 183 | 184 | # Resize the images in-place in the container image 185 | print('Resizing images...') 186 | resize_images(os.path.join(input_data_base_path, PREFIX_NAME_IMAGE), IMAGE_WIDTH, IMAGE_HEIGHT) 187 | resize_images(os.path.join(input_data_base_path, PREFIX_NAME_MASK), IMAGE_WIDTH, IMAGE_HEIGHT) 188 | 189 | # Create train test validation split 190 | (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = load_data(input_data_base_path, split=float(args.split)) 191 | 192 | # Copy to the output paths 193 | Path(train_output_base_path + "/" + PREFIX_NAME_IMAGE ).mkdir(parents=True, exist_ok=True) 194 | Path(train_output_base_path + "/" + PREFIX_NAME_MASK 
).mkdir(parents=True, exist_ok=True) 195 | Path(val_output_base_path + "/" + PREFIX_NAME_IMAGE ).mkdir(parents=True, exist_ok=True) 196 | Path(val_output_base_path + "/" + PREFIX_NAME_MASK ).mkdir(parents=True, exist_ok=True) 197 | Path(test_output_base_path + "/" + PREFIX_NAME_IMAGE ).mkdir(parents=True, exist_ok=True) 198 | Path(test_output_base_path + "/" + PREFIX_NAME_MASK ).mkdir(parents=True, exist_ok=True) 199 | for file in train_x : 200 | shutil.copy(file, os.path.join(train_output_base_path, PREFIX_NAME_IMAGE + '/' + os.path.basename(file))) 201 | for file in train_y : 202 | shutil.copy(file, os.path.join(train_output_base_path, PREFIX_NAME_MASK + '/'+ os.path.basename(file))) 203 | for file in valid_x : 204 | shutil.copy(file, os.path.join(val_output_base_path, PREFIX_NAME_IMAGE + '/'+ os.path.basename(file))) 205 | for file in valid_y : 206 | shutil.copy(file, os.path.join(val_output_base_path, PREFIX_NAME_MASK + '/'+ os.path.basename(file))) 207 | for file in test_x : 208 | shutil.copy(file, os.path.join(test_output_base_path, PREFIX_NAME_IMAGE + '/'+ os.path.basename(file))) 209 | for file in test_y : 210 | shutil.copy(file, os.path.join(test_output_base_path, PREFIX_NAME_MASK + '/'+ os.path.basename(file))) 211 | # Save the preprocessing report to make information available to downstream steps 212 | preprocessing_report = { 213 | 'preprocessing': { 214 | 'dataset': { 215 | 'num_training_samples': len(train_x), 216 | 'num_test_samples': len(test_x), 217 | 'num_val_samples': len(valid_x) 218 | } 219 | } 220 | } 221 | print('Preprocessing report:', preprocessing_report) 222 | report_output_path = os.path.join(report_output_base_path, 'preprocessing_report.json') 223 | with open(report_output_path, "w") as f: 224 | f.write(json.dumps(preprocessing_report)) 225 | 226 | 227 | -------------------------------------------------------------------------------- /src/edge/run.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: MIT-0 3 | import os 4 | import numpy as np 5 | import json 6 | import logging 7 | import PIL.Image 8 | import glob 9 | import random 10 | import re 11 | from timeit import default_timer as timer 12 | 13 | from flask import Flask 14 | from flask import render_template 15 | from waitress import serve 16 | flask_app = Flask(__name__) 17 | 18 | import app 19 | 20 | # Get environment variables 21 | if not 'SM_EDGE_AGENT_HOME' in os.environ: 22 | logging.error('You need to define the environment variable SM_EDGE_AGENT_HOME') 23 | raise Exception('Environment variable not defined') 24 | 25 | if not 'SM_APP_ENV' in os.environ: 26 | logging.error('You need to define the environment variable SM_APP_ENV as either "prod" or "dev"') 27 | raise Exception('Environment variable not defined') 28 | 29 | # Configuration constants 30 | SM_EDGE_AGENT_HOME = os.environ['SM_EDGE_AGENT_HOME'] 31 | AGENT_SOCKET = '/tmp/edge_agent' 32 | SM_EDGE_MODEL_PATH = os.path.join(SM_EDGE_AGENT_HOME, 'model/dev') 33 | SM_EDGE_CONFIGFILE_PATH = os.path.join(SM_EDGE_AGENT_HOME, 'conf/config_edge_device.json') 34 | CONFIG_FILE_PATH = './models_config.json' 35 | SM_APP_ENV = os.environ['SM_APP_ENV'] 36 | IMG_WIDTH = 224 37 | IMG_HEIGHT = 224 38 | 39 | # Set up logging 40 | logging.basicConfig(level=logging.INFO) 41 | logging.debug('Initializing...') 42 | 43 | # Loading config file 44 | with open(CONFIG_FILE_PATH, 'r') as f: 45 | config = json.load(f) 46 | 47 | # Load SM Edge Agent config file 48 | iot_params = json.loads(open(SM_EDGE_CONFIGFILE_PATH, 'r').read()) 49 | 50 | # Retrieve the IoT thing name associated with the edge device 51 | iot_client = app.get_client('iot', iot_params) 52 | sm_client = app.get_client('sagemaker', iot_params) 53 | 54 | resp = sm_client.describe_device( 55 | DeviceName=iot_params['sagemaker_edge_core_device_name'], 56 | DeviceFleetName=iot_params['sagemaker_edge_core_device_fleet_name'] 57 | ) 58 | device_name = resp['IotThingName'] 59 | mqtt_host = iot_client.describe_endpoint(endpointType='iot:Data-ATS')['endpointAddress'] 60 | mqtt_port = 8883 61 | 62 | # Send logs to cloud via MQTT topics 63 | logger = app.Logger(device_name, iot_params) 64 | 65 | # Initialize the Edge Manager agent 66 | edge_agent = app.EdgeAgentClient(AGENT_SOCKET) 67 | 68 | # A list of names of loaded models with their name, version and identifier 69 | models_loaded = [] 70 | 71 | def create_model_identifier(name, version): 72 | """Get a compatible string as a combination of name and version""" 73 | new_name = "%s-%s" % (name, str(version).replace('.', '-')) 74 | return new_name 75 | 76 | def get_model_from_name(name, config_dict): 77 | """Returns the model dict from the config dict""" 78 | model_obj = next((x for x in config_dict['models'] if x['name'] == name), None) 79 | if model_obj is not None: 80 | return model_obj 81 | else: 82 | logging.warning('Model object not found in config') 83 | return None 84 | 85 | def get_square_image(img): 86 | """Returns a squared image by adding black padding""" 87 | padding_color = (0, 0, 0) 88 | width, height = img.size 89 | if width == height: 90 | return img 91 | elif width > height: 92 | result = PIL.Image.new(img.mode, (width, width), padding_color) 93 | result.paste(img, (0, (width - height) // 2)) 94 | return result 95 | else: 96 | result = PIL.Image.new(img.mode, (height, height), padding_color) 97 | result.paste(img, ((height - width) // 2, 0)) 98 | return result 99 | 100 | 101 | def preprocess_image(img, img_width, img_height): 102 | 
"""Preprocesses the image before feeding it into the ML model""" 103 | x = get_square_image(img) 104 | x = np.asarray(img.resize((img_width, img_height))).astype(np.float32) 105 | x_transposed = x.transpose((2,0,1)) 106 | x_batchified = np.expand_dims(x_transposed, axis=0) 107 | return x_batchified 108 | 109 | # Setup model callback method 110 | def load_model(name, version): 111 | """Loads the model into the edge agent and unloads previous versions if any.""" 112 | global models_loaded 113 | version = str(version) 114 | # Create a model name string as a concatenation of name and version 115 | identifier = "%s-%s" % (name, version.replace('.', '-')) 116 | 117 | # Check if previous version of this model was loaded already and unload it 118 | matching_model_dict = next((m for m in models_loaded if m['name'] == name), None) 119 | if matching_model_dict: 120 | logging.info('Previous version of new model found: %s' % matching_model_dict) 121 | 122 | # Check if version is higher 123 | if float(version) <= float(matching_model_dict['version']): 124 | logging.info('New model version is not higher than previous version. Not loading incoming model.') 125 | return 126 | 127 | logging.info('Loading model into edge agent: %s' % identifier) 128 | resp = edge_agent.load_model(identifier, os.path.join(SM_EDGE_MODEL_PATH, name, version)) 129 | if resp is None: 130 | logging.error('It was not possible to load the model. Is the agent running?') 131 | return 132 | else: 133 | models_loaded.append({'name': name, 'version': version, 'identifier': identifier}) 134 | logging.info('Sucessfully loaded new model version into agent') 135 | if matching_model_dict: 136 | logging.info('Unloading previous model version') 137 | edge_agent.unload_model(matching_model_dict['identifier']) 138 | models_loaded.remove(matching_model_dict) 139 | 140 | def run_segmentation_inference(agent, filename): 141 | """Runs inference on the given image file. Returns prediction and model latency.""" 142 | 143 | # Check if model for segmentation is downloaded 144 | model_name_img_seg = config['mappings']['image-segmentation-app'] 145 | model_is_loaded = any([m['name']==model_name_img_seg for m in models_loaded]) 146 | if not model_is_loaded: 147 | logging.info('Model for image segmentation not loaded, waiting for deployment...') 148 | return None, None 149 | 150 | # Get the identifier of the currently loaded model 151 | model_dict_img_seg = next((x for x in models_loaded if x['name'] == model_name_img_seg), None) 152 | if not model_dict_img_seg: 153 | logging.info('Model for image segmentation not loaded, waiting for deployment...') 154 | return None, None 155 | model_id_img_seg = model_dict_img_seg['identifier'] 156 | 157 | logging.info('\nSegmentation inference with file %s and model %s' % (filename, model_id_img_seg)) 158 | image = PIL.Image.open(filename) 159 | image = image.convert(mode='RGB') 160 | 161 | # Preprocessing 162 | x_batchified = preprocess_image(image, IMG_WIDTH, IMG_HEIGHT) 163 | 164 | # Fit into 0-1 range, as the unet model expects this 165 | x_batchified = x_batchified/255.0 166 | 167 | # Run inference 168 | t_start = timer() 169 | y = agent.predict(model_id_img_seg, x_batchified) 170 | t_stop = timer() 171 | t_ms = np.round((t_stop - t_start) * 1000, decimals=0) 172 | 173 | y_mask = y[0] > 0.5 174 | agent.capture_data(model_id_img_seg, x_batchified, y.astype(np.float32)) 175 | 176 | return y_mask, t_ms 177 | 178 | 179 | def run_classification_inference(agent, filename): 180 | """Runs inference on the given image file. 
Returns prediction and model latency.""" 181 | # Check if the model for image classification is available 182 | # The application always uses the latest version of the model in the list of loaded models 183 | model_name_img_clf = config['mappings']['image-classification-app'] 184 | model_is_loaded = any([m['name']==model_name_img_clf for m in models_loaded]) 185 | if not model_is_loaded: 186 | logging.info('Model for image classification not loaded, waiting for deployment...') 187 | return None, None 188 | 189 | # Get the identifier of the currently loaded model 190 | model_dict_img_clf = next((x for x in models_loaded if x['name'] == model_name_img_clf), None) 191 | if not model_dict_img_clf: 192 | logging.info('Model for image classification not loaded, waiting for deployment...') 193 | return None, None 194 | model_id_img_clf = model_dict_img_clf['identifier'] 195 | 196 | logging.info('\nClassification inference with %s' % filename) 197 | image = PIL.Image.open(filename) 198 | image = image.convert(mode='RGB') 199 | 200 | # Preprocessing 201 | x_batchified = preprocess_image(image, IMG_WIDTH, IMG_HEIGHT) 202 | 203 | # Run inference with agent and time taken 204 | t_start = timer() 205 | y = agent.predict(model_id_img_clf, x_batchified) 206 | t_stop = timer() 207 | t_ms = np.round((t_stop - t_start) * 1000, decimals=0) 208 | 209 | agent.capture_data(model_id_img_clf, x_batchified, y) 210 | y = y.ravel() 211 | logging.info(y) 212 | 213 | img_clf_class_labels = ['normal', 'anomalous'] 214 | 215 | for indx, l in enumerate(img_clf_class_labels): 216 | logging.info('Class probability label "%s": %f' % (img_clf_class_labels[indx], y[indx])) 217 | return y, t_ms 218 | 219 | 220 | # Get list of supported model names 221 | models_supported = config['mappings'].values() 222 | 223 | # Initialize OTA model manager 224 | model_manager = app.OTAModelUpdate(device_name, iot_params, mqtt_host, mqtt_port, load_model, SM_EDGE_MODEL_PATH, models_supported) 225 | 226 | @flask_app.route('/') 227 | def homepage(): 228 | # Get a random image from the directory 229 | list_img_inf = glob.glob('./static/**/*.png') 230 | 231 | if len(list_img_inf) == 0: 232 | return render_template('main_noimg.html', 233 | loaded_models=models_loaded 234 | ) 235 | 236 | inference_img_path = random.choice(list_img_inf) 237 | inference_img_filename = re.search(r'(?<=\/static\/).+$', inference_img_path)[0] 238 | 239 | # Run inference on this image 240 | y_clf, t_ms_clf = run_classification_inference(edge_agent, inference_img_path) 241 | y_segm, t_ms_segm = run_segmentation_inference(edge_agent, inference_img_path) 242 | 243 | # Synthesize mask into binary image 244 | if y_segm is not None: 245 | segm_img_encoded = app.create_b64_img_from_mask(y_segm) 246 | segm_img_decoded_utf8 = segm_img_encoded.decode('utf-8') 247 | logging.info('Model latency: t_segm=%fms' % t_ms_segm) 248 | else: 249 | segm_img_encoded = None 250 | segm_img_decoded_utf8 = None 251 | 252 | # Extract predictions from the y array 253 | # Assuming that the entry at index=0 is the probability for "normal" and the other for "anomalous" 254 | clf_class_labels = ['normal', 'anomalous'] 255 | if y_clf is not None: 256 | y_clf_normal = np.round(y_clf[0], decimals=6) 257 | y_clf_anomalous = np.round(y_clf[1], decimals=6) 258 | y_clf_class = clf_class_labels[np.argmax(y_clf)] 259 | logging.info('Model latency: t_classification=%fms' % t_ms_clf) 260 | else: 261 | y_clf_normal = None 262 | y_clf_anomalous = None 263 | y_clf_class = None 264 | 265 | 266 | # Return rendered HTML 
page with predictions 267 | return render_template('main.html', 268 | loaded_models=models_loaded, 269 | image_file=inference_img_filename, 270 | y_clf_normal=y_clf_normal, 271 | y_clf_anomalous=y_clf_anomalous, 272 | y_clf_class=y_clf_class, 273 | y_segm_img=segm_img_decoded_utf8, 274 | latency_clf=t_ms_clf, 275 | latency_segm=t_ms_segm 276 | ) 277 | 278 | # INIT APP 279 | # Initially load models as defined in config file 280 | for model_config in config['models']: 281 | model_name = model_config['name'] 282 | model_version = model_config['version'] 283 | try: 284 | load_model(model_name, model_version) 285 | except Exception as e: 286 | logging.error('Model could not be loaded. Did you specify it properly in the config file?') 287 | raise e 288 | 289 | 290 | if __name__ == '__main__': 291 | try: 292 | if SM_APP_ENV == 'prod': 293 | serve(flask_app, host='0.0.0.0', port=8080) 294 | elif SM_APP_ENV == 'dev': 295 | flask_app.run(debug=False, use_reloader=False, host='0.0.0.0', port=8080) 296 | else: 297 | raise Exception('SM_APP_ENV needs to be either "prod" or "dev"') 298 | 299 | except KeyboardInterrupt as e: 300 | pass 301 | except Exception as e: 302 | logging.error(e) 303 | 304 | logging.info('Shutting down') 305 | 306 | for m in models_loaded: 307 | logging.info("Unloading model %s" % m) 308 | edge_agent.unload_model(m['identifier']) 309 | 310 | 311 | # Updating config file 312 | config['models'] = models_loaded 313 | 314 | with open(CONFIG_FILE_PATH, 'w') as f: 315 | json.dump(config, f) 316 | 317 | del model_manager 318 | del edge_agent 319 | del logger -------------------------------------------------------------------------------- /src/cloud/pipelines/image_classification/pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | import os 4 | import numpy as np 5 | import glob 6 | import boto3 7 | import time 8 | import sagemaker 9 | import sagemaker.session 10 | 11 | from sagemaker.workflow.parameters import ParameterInteger, ParameterString 12 | from sagemaker.sklearn.processing import SKLearnProcessor 13 | from sagemaker.estimator import Estimator 14 | from sagemaker.processing import ProcessingInput, ProcessingOutput 15 | from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CreateModelStep, CacheConfig 16 | from sagemaker.workflow.properties import PropertyFile 17 | from sagemaker.workflow.functions import Join, JsonGet 18 | from sagemaker.workflow.condition_step import ConditionStep 19 | from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo 20 | from sagemaker.inputs import TrainingInput, CreateModelInput 21 | from sagemaker.workflow.step_collections import RegisterModel 22 | from sagemaker.model_metrics import MetricsSource, ModelMetrics 23 | from sagemaker.model import Model 24 | from sagemaker.workflow.pipeline import Pipeline 25 | from sagemaker.image_uris import retrieve 26 | 27 | from botocore.exceptions import ClientError, ValidationError 28 | 29 | BASE_DIR = os.path.dirname(os.path.realpath(__file__)) 30 | 31 | def get_session(region, default_bucket): 32 | """Gets the sagemaker session based on the region. 
33 | 34 | Args: 35 | region: the aws region to start the session 36 | default_bucket: the bucket to use for storing the artifacts 37 | 38 | Returns: 39 | `sagemaker.session.Session instance 40 | """ 41 | 42 | boto_session = boto3.Session(region_name=region) 43 | 44 | sagemaker_client = boto_session.client("sagemaker") 45 | runtime_client = boto_session.client("sagemaker-runtime") 46 | return sagemaker.session.Session( 47 | boto_session=boto_session, 48 | sagemaker_client=sagemaker_client, 49 | sagemaker_runtime_client=runtime_client, 50 | default_bucket=default_bucket, 51 | ) 52 | 53 | def get_pipeline( 54 | region, 55 | role=None, 56 | default_bucket=None, 57 | pipeline_name="defect-detection-img-classification-pipeline", 58 | base_job_prefix="defect-detection-img-classification", 59 | ): 60 | """Gets a SageMaker ML Pipeline instance working with on DefectDetection data. 61 | 62 | Args: 63 | region: AWS region to create and run the pipeline. 64 | role: IAM role to create and run steps and pipeline. 65 | default_bucket: the bucket to use for storing the artifacts 66 | 67 | Returns: 68 | an instance of a pipeline 69 | """ 70 | sagemaker_session = get_session(region, default_bucket) 71 | if role is None: 72 | role = sagemaker.session.get_execution_role(sagemaker_session) 73 | 74 | ## By enabling cache, if you run this pipeline again, without changing the input 75 | ## parameters it will skip the training part and reuse the previous trained model 76 | cache_config = CacheConfig(enable_caching=True, expire_after="30d") 77 | ts = time.strftime('%Y-%m-%d-%H-%M-%S') 78 | 79 | # Data prep 80 | processing_instance_type = ParameterString( # instance type for data preparation 81 | name="ProcessingInstanceType", 82 | default_value="ml.m5.xlarge" 83 | ) 84 | processing_instance_count = ParameterInteger( # number of instances used for data preparation 85 | name="ProcessingInstanceCount", 86 | default_value=1 87 | ) 88 | 89 | # Input shape 90 | # --> Image size (height and width, as we need only use square images) desired for training. The 91 | # pipeline will square the images to this size if they are not square already by adding padding. 92 | target_image_size = ParameterString( 93 | name="TargetImageSize", 94 | default_value="224" 95 | ) 96 | 97 | # Augement Count 98 | augment_count_normal = ParameterString( # by how many samples you want to augment the normal samples 99 | name="AugmentCountNormal", 100 | default_value="0" 101 | ) 102 | augment_count_anomalous = ParameterString( # by how many samples you want to augment the anomalous samples 103 | name="AugmentCountAnomalous", 104 | default_value="0" 105 | ) 106 | 107 | # Training 108 | training_instance_type = ParameterString( # instance type for training the model 109 | name="TrainingInstanceType", 110 | default_value="ml.p3.2xlarge" 111 | ) 112 | training_instance_count = ParameterInteger( # number of instances used to train your model 113 | name="TrainingInstanceCount", 114 | default_value=1 115 | ) 116 | training_epochs = ParameterString( 117 | name="TrainingEpochs", 118 | default_value="15" 119 | ) 120 | training_num_training_samples = ParameterString( 121 | name="TrainingNumTrainingSamples", 122 | default_value="3600" # Change this to the number of training samples used! 
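        # The built-in image classification algorithm uses num_training_samples when laying out its
        # learning-rate schedule, so this value should reflect the real size of the training channel
        # after augmentation; the preprocessing step writes the actual count to preprocessing_report.json.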
123 | ) 124 | 125 | # Dataset input data: S3 path 126 | input_data = ParameterString( 127 | name="InputData", 128 | default_value="", 129 | ) 130 | 131 | # Model Approval State 132 | model_approval_status = ParameterString( 133 | name="ModelApprovalStatus", 134 | default_value="PendingManualApproval" 135 | ) 136 | 137 | # Model package group name for registering in model registry 138 | model_package_group_name = ParameterString( 139 | name="ModelPackageGroupName", 140 | default_value="defect-detection-img-classification-model-group" 141 | ) 142 | 143 | 144 | aws_region = sagemaker_session.boto_region_name 145 | training_image = retrieve(framework='image-classification', region=aws_region, image_scope='training') 146 | 147 | # Hardcoded hyperparameters 148 | NUM_CLASSES = 2 149 | BATCH_SIZE = 8 150 | 151 | # The preprocessor 152 | preprocessor = SKLearnProcessor( 153 | framework_version="0.23-1", 154 | role=role, 155 | instance_type=processing_instance_type, 156 | instance_count=processing_instance_count, 157 | max_runtime_in_seconds=7200, 158 | ) 159 | 160 | # A preprocessing report to store some information from the preprocessing step for next steps 161 | preprocessing_report = PropertyFile( 162 | name='PreprocessingReport', 163 | output_name='preprocessing_report', 164 | path='preprocessing_report.json' 165 | ) 166 | 167 | # Preprocessing Step 168 | step_process = ProcessingStep( 169 | name="DefectDetectionPreprocessing", 170 | code=os.path.join(BASE_DIR, 'preprocessing.py'), ## this is the script defined above 171 | processor=preprocessor, 172 | inputs=[ 173 | ProcessingInput(source=input_data, destination='/opt/ml/processing/input') 174 | ], 175 | outputs=[ 176 | ProcessingOutput(output_name='train_data', source='/opt/ml/processing/train'), 177 | ProcessingOutput(output_name='test_data', source='/opt/ml/processing/test'), 178 | ProcessingOutput(output_name='val_data', source='/opt/ml/processing/val'), 179 | ProcessingOutput(output_name='preprocessing_report', source='/opt/ml/processing/report') 180 | ], 181 | job_arguments=[ 182 | '--split', '0.1', 183 | '--augment-count-normal', augment_count_normal, 184 | '--augment-count-anomalous', augment_count_anomalous, 185 | '--image-width', target_image_size, 186 | '--image-height', target_image_size 187 | ], 188 | property_files=[preprocessing_report] 189 | ) 190 | 191 | # Define Image Classification Estimator 192 | hyperparameters = { 193 | 'num_layers': 18, 194 | 'image_shape': Join(on=',', values=['3', target_image_size, target_image_size]), 195 | 'num_classes': NUM_CLASSES, 196 | 'mini_batch_size': BATCH_SIZE, 197 | 'num_training_samples': training_num_training_samples, 198 | 'epochs': training_epochs, 199 | 'learning_rate': 0.01, 200 | 'top_k': 2, 201 | 'use_pretrained_model': 1, 202 | 'precision_dtype': 'float32' 203 | } 204 | 205 | ic_estimator = Estimator( 206 | image_uri=training_image, 207 | role=role, 208 | instance_count=training_instance_count, 209 | instance_type=training_instance_type, 210 | volume_size = 50, 211 | max_run = 360000, 212 | input_mode= 'Pipe', 213 | base_job_name='img-classification-training', 214 | output_path='s3://{}/{}/{}/{}'.format(default_bucket, 'models', base_job_prefix, 'training-output'), 215 | hyperparameters=hyperparameters 216 | ) 217 | 218 | step_train = TrainingStep( 219 | name="DefectDetectionImageClassificationTrain", 220 | estimator=ic_estimator, 221 | inputs={ 222 | "train": TrainingInput( 223 | s3_data=step_process.properties.ProcessingOutputConfig.Outputs["train_data"].S3Output.S3Uri, 224 | 
content_type="application/x-recordio", 225 | s3_data_type='S3Prefix' 226 | ), 227 | "validation": TrainingInput( 228 | s3_data=step_process.properties.ProcessingOutputConfig.Outputs["val_data"].S3Output.S3Uri, 229 | content_type="application/x-recordio", 230 | s3_data_type='S3Prefix' 231 | ) 232 | }, 233 | cache_config=cache_config 234 | ) 235 | 236 | # Set up for the evaluation processing step 237 | evaluation_report = PropertyFile( 238 | name='EvaluationReport', 239 | output_name='evaluation_report', 240 | path='evaluation_report.json' 241 | ) 242 | 243 | evalation_processor = SKLearnProcessor( 244 | framework_version="0.23-1", 245 | role=role, 246 | instance_type=processing_instance_type, 247 | instance_count=processing_instance_count, 248 | max_runtime_in_seconds=7200 249 | ) 250 | 251 | step_eval = ProcessingStep( 252 | name="DefectDetectionEvaluation", 253 | code=os.path.join(BASE_DIR, 'evaluation.py'), ## this is the script defined above 254 | processor=evalation_processor, 255 | inputs=[ 256 | ProcessingInput(source=step_process.properties.ProcessingOutputConfig.Outputs["test_data"].S3Output.S3Uri, destination='/opt/ml/processing/test'), 257 | ProcessingInput(source=step_train.properties.ModelArtifacts.S3ModelArtifacts, destination='/opt/ml/processing/model') 258 | 259 | ], 260 | outputs=[ 261 | ProcessingOutput(output_name='evaluation_report', source='/opt/ml/processing/report') 262 | ], 263 | property_files=[evaluation_report], 264 | job_arguments=[ 265 | '--image-width', target_image_size, 266 | '--image-height', target_image_size 267 | ], 268 | ) 269 | 270 | model_metrics = ModelMetrics( 271 | model_statistics=MetricsSource( 272 | s3_uri="{}/evaluation_report.json".format( 273 | step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"] 274 | ), 275 | content_type="application/json", 276 | ) 277 | ) 278 | 279 | # Register model step that will be conditionally executed 280 | step_register = RegisterModel( 281 | name="DefectDetectionImageClassificationRegister", 282 | estimator=ic_estimator, 283 | model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, 284 | content_types=["application/x-recordio"], 285 | response_types=["application/json"], 286 | inference_instances=["ml.c5.2xlarge", "ml.p3.2xlarge"], 287 | transform_instances=["ml.c5.xlarge"], 288 | model_package_group_name=model_package_group_name, 289 | model_metrics=model_metrics, 290 | approval_status=model_approval_status 291 | ) 292 | 293 | # Condition step for evaluating model quality and branching execution 294 | cond_lte = ConditionGreaterThanOrEqualTo( # You can change the condition here 295 | left=JsonGet( 296 | step_name=step_eval.name, 297 | property_file=evaluation_report, 298 | json_path="multiclass_classification_metrics.accuracy.value", # This should follow the structure of your report_dict defined in the evaluate.py file. 
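        # Note: despite the variable name "cond_lte", ConditionGreaterThanOrEqualTo means the register
        # step only runs when this accuracy value is >= the threshold given in "right" below; otherwise
        # the pipeline finishes without registering a model.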
299 | ), 300 | right=0.8, # You can change the threshold here 301 | ) 302 | step_cond = ConditionStep( 303 | name="DefectDetectionImageClassificationAccuracyCondition", 304 | conditions=[cond_lte], 305 | if_steps=[step_register], 306 | else_steps=[], 307 | ) 308 | 309 | pipeline = Pipeline( 310 | name=pipeline_name, 311 | parameters=[ 312 | processing_instance_type, 313 | processing_instance_count, 314 | target_image_size, 315 | augment_count_normal, 316 | augment_count_anomalous, 317 | training_instance_type, 318 | training_instance_count, 319 | training_num_training_samples, 320 | training_epochs, 321 | input_data, 322 | model_approval_status, 323 | model_package_group_name 324 | ], 325 | steps=[step_process, step_train, step_eval, step_cond], 326 | sagemaker_session=sagemaker_session, 327 | ) 328 | return pipeline -------------------------------------------------------------------------------- /src/cloud/semantic_segmentation_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Defect Detection: Semantic Segmentation - Pipeline Execution\n", 8 | "\n", 9 | "In this notebook, we will use the pipeline configured in the included python package under `pipelines` together with the defined code for preprocessing and training to automate the model training. It is easy to use such that you can simple drop in whatever input data for image classification you want and have it train a model automatically." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import boto3\n", 19 | "import sagemaker\n", 20 | "import time\n", 21 | "import uuid\n", 22 | "import json\n", 23 | "\n", 24 | "iot_client = boto3.client('iot')\n", 25 | "sts_client = boto3.client('sts')\n", 26 | "sm_client = boto3.client('sagemaker')\n", 27 | "\n", 28 | "# Get the account id\n", 29 | "account_id = sts_client.get_caller_identity()[\"Account\"]\n", 30 | "\n", 31 | "# Project Name as defined in your CloudFormation template\n", 32 | "PROJECT_NAME = ''\n", 33 | "\n", 34 | "region = boto3.Session().region_name\n", 35 | "role = sagemaker.get_execution_role()\n", 36 | "bucket_name = 'sm-edge-workshop-%s-%s' % (PROJECT_NAME, account_id)\n", 37 | "\n", 38 | "# Change these to reflect your project/business name or if you want to separate ModelPackageGroup/Pipeline from the rest of your team\n", 39 | "model_package_group_name = 'defect-detection-semantic-segmentation-%s' % PROJECT_NAME\n", 40 | "job_prefix = 'defect-detection-semantic-segmentation'\n", 41 | "pipeline_name = 'defect-detection-semantic-segmentation-pipeline-%s' % PROJECT_NAME" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "### Getting the pipeline definition\n", 49 | "\n", 50 | "We use the `get_pipeline` method to create a pipeline DAG definition with our provided input. The input provided here is fixed for each pipeline you create or update, you cannot change these parameters with each execution (see usage of parameters in the cell below)." 
51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "from pipelines.semantic_segmentation.pipeline import get_pipeline\n", 60 | "\n", 61 | "pipeline = get_pipeline(\n", 62 | " region=region,\n", 63 | " role=role,\n", 64 | " default_bucket=bucket_name,\n", 65 | " pipeline_name=pipeline_name,\n", 66 | " base_job_prefix=job_prefix\n", 67 | ")" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "### Creating the pipeline\n", 75 | "\n", 76 | "We create the pipeline (or update it in case it exists) with the previously defined DAG definition." 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "pipeline.upsert(role_arn=role)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### Starting the pipeline execution\n", 93 | "\n", 94 | "We now start the exeuction of the pipeline with a given set of parameters which we can alter for every execution." 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "input_data_path = 's3://%s/' % bucket_name\n", 104 | "\n", 105 | "execution = pipeline.start(\n", 106 | " parameters=dict(\n", 107 | " InputData=input_data_path,\n", 108 | " TrainingInstanceType=\"ml.p3.2xlarge\",\n", 109 | " ModelApprovalStatus=\"Approved\",\n", 110 | " ModelPackageGroupName=model_package_group_name\n", 111 | " )\n", 112 | ")" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "### Check progress\n", 120 | "\n", 121 | "After execution started, you can always check the progress of your pipeline execution either by looking at the processing and training jobs in the SageMaker Console, using the built-in SageMaker Studio Pipeline visualization tools or using SDK methods like below." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "execution.describe()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "## Preparing trained model for edge\n", 138 | "\n", 139 | "Please proceed here only, if the execution of the training pipeline as successful. In this part of the workshop, we will prepare the model which you just trained in the pipeline for the deployment onto the edge device." 
140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "compilation_output_sub_folder = 'models/' + job_prefix + '/compilation-output'\n", 149 | "edgepackaging_output_sub_folder = 'models/' + job_prefix + '/edge-packaging-output'\n", 150 | "\n", 151 | "# S3 Location to save the model artifact after compilation\n", 152 | "s3_compilation_output_location = 's3://{}/{}'.format(bucket_name, compilation_output_sub_folder)\n", 153 | "\n", 154 | "# S3 Location to save the model artifact after edge packaging\n", 155 | "s3_edgepackaging_output_location = 's3://{}/{}'.format(bucket_name, edgepackaging_output_sub_folder)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "# Define some helper functions\n", 165 | "\n", 166 | "def get_latest_approved_s3_model_location(client, model_package_group):\n", 167 | " \"\"\"Returns the model location of the latest approved model version in a group\"\"\"\n", 168 | " response = client.list_model_packages(\n", 169 | " ModelPackageGroupName=model_package_group,\n", 170 | " ModelApprovalStatus='Approved'\n", 171 | " )\n", 172 | " latest_version = max(response['ModelPackageSummaryList'], key=lambda x:x['ModelPackageVersion'])\n", 173 | " model_artifact_location = client.describe_model_package(ModelPackageName=latest_version['ModelPackageArn'])['InferenceSpecification']['Containers'][0]['ModelDataUrl']\n", 174 | " return model_artifact_location\n", 175 | "\n", 176 | "def get_latest_approved_model_version(client, model_package_group):\n", 177 | " \"\"\"Returns the model version of the latest approved model version in a group\"\"\"\n", 178 | " response = client.list_model_packages(\n", 179 | " ModelPackageGroupName=model_package_group,\n", 180 | " ModelApprovalStatus='Approved'\n", 181 | " )\n", 182 | " latest_version = max(response['ModelPackageSummaryList'], key=lambda x:x['ModelPackageVersion'])\n", 183 | " return latest_version['ModelPackageVersion']" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "### Run SageMaker Neo compilation job" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "# Retrieve some information on the model we just trained and registered in SageMaker Model Registry\n", 200 | "s3_model_artifact_location = get_latest_approved_s3_model_location(sm_client, model_package_group_name)\n", 201 | "print(s3_model_artifact_location)\n", 202 | "\n", 203 | "model_name = 'unet'\n", 204 | "compilation_job_name = '%s-%d' % (model_name, int(time.time()*1000))\n", 205 | "\n", 206 | "# Let's start a compilation job for the target architecture\n", 207 | "sm_client.create_compilation_job(\n", 208 | " CompilationJobName=compilation_job_name,\n", 209 | " RoleArn=role,\n", 210 | " InputConfig={\n", 211 | " 'S3Uri': s3_model_artifact_location,\n", 212 | " 'DataInputConfig': '{\"input_image\":[1,%d,%d,%d]}' % (3,224, 224),\n", 213 | " 'Framework': 'KERAS'\n", 214 | " },\n", 215 | " OutputConfig={\n", 216 | " 'S3OutputLocation': s3_compilation_output_location,\n", 217 | " 'TargetPlatform': { 'Os': 'LINUX', 'Arch': 'X86_64' }\n", 218 | " #'TargetPlatform': { 'Os': 'LINUX', 'Arch': 'ARM64', 'Accelerator': 'NVIDIA' },\n", 219 | " #'CompilerOptions': '{\"trt-ver\": \"7.1.3\", \"cuda-ver\": \"10.2\", \"gpu-code\": 
\"sm_53\"}'\n", 220 | " #'TargetPlatform': { 'Os': 'LINUX', 'Arch': 'ARM64'},\n", 221 | " #'TargetDevice': 'ml_c5'\n", 222 | " },\n", 223 | " StoppingCondition={ 'MaxRuntimeInSeconds': 900 }\n", 224 | ")\n", 225 | "\n", 226 | "# Poll the status of the job\n", 227 | "print('Started compilation job .', end='')\n", 228 | "while True:\n", 229 | " resp = sm_client.describe_compilation_job(CompilationJobName=compilation_job_name)\n", 230 | " if resp['CompilationJobStatus'] in ['STARTING', 'INPROGRESS']:\n", 231 | " print('.', end='')\n", 232 | " else:\n", 233 | " print(resp['CompilationJobStatus'], compilation_job_name)\n", 234 | " break\n", 235 | " time.sleep(5)\n", 236 | " \n", 237 | "if resp['CompilationJobStatus'] == 'COMPLETED':\n", 238 | " s3_compiled_model_artifact_location_fullpath = resp['ModelArtifacts']['S3ModelArtifacts']\n", 239 | " print(f'Compiled artifact location in S3: {s3_compiled_model_artifact_location_fullpath}')" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "### Running the SageMaker Edge Packaging job" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "# Run the edge packaging job\n", 256 | "edge_packaging_job_name='%s-%d' % (model_name, int(time.time()*1000))\n", 257 | "model_version=str(get_latest_approved_model_version(sm_client, model_package_group_name))\n", 258 | "\n", 259 | "# Start the edge packaging job\n", 260 | "resp = sm_client.create_edge_packaging_job(\n", 261 | " EdgePackagingJobName=edge_packaging_job_name,\n", 262 | " CompilationJobName=compilation_job_name,\n", 263 | " ModelName=model_name,\n", 264 | " ModelVersion=model_version,\n", 265 | " RoleArn=role,\n", 266 | " OutputConfig={\n", 267 | " 'S3OutputLocation': s3_edgepackaging_output_location\n", 268 | " }\n", 269 | ")\n", 270 | "\n", 271 | "# Poll the status of the job\n", 272 | "print('Started edge packaging job .', end='')\n", 273 | "while True:\n", 274 | " resp = sm_client.describe_edge_packaging_job(EdgePackagingJobName=edge_packaging_job_name)\n", 275 | " if resp['EdgePackagingJobStatus'] in ['STARTING', 'INPROGRESS']:\n", 276 | " print('.', end='')\n", 277 | " else:\n", 278 | " print(resp['EdgePackagingJobStatus'], edge_packaging_job_name)\n", 279 | " break\n", 280 | " time.sleep(5)\n", 281 | " \n", 282 | "if resp['EdgePackagingJobStatus'] == 'COMPLETED':\n", 283 | " s3_packaged_model_artifact_location_fullpath = resp['ModelArtifact']\n", 284 | " print(f'Packaged artifact location in S3: {s3_packaged_model_artifact_location_fullpath}')" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "### Running IoT Job for deployment onto the edge" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "def split_s3_path(s3_path):\n", 301 | " path_parts=s3_path.replace(\"s3://\",\"\").split(\"/\")\n", 302 | " bucket=path_parts.pop(0)\n", 303 | " key=\"/\".join(path_parts)\n", 304 | " return bucket, key\n", 305 | "\n", 306 | "model_bucket, model_key = split_s3_path(s3_packaged_model_artifact_location_fullpath)" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "resp = iot_client.create_job(\n", 316 | " jobId=str(uuid.uuid4()),\n", 317 | " targets=[\n", 318 | " 
'arn:aws:iot:%s:%s:thinggroup/defect-detection-%s-group' % (region, account_id, PROJECT_NAME), \n", 319 | " ],\n", 320 | " document=json.dumps({\n", 321 | " 'type': 'new_model',\n", 322 | " 'model_version': model_version,\n", 323 | " 'model_name': model_name,\n", 324 | " 'model_package_bucket': model_bucket,\n", 325 | " 'model_package_key': model_key\n", 326 | " }),\n", 327 | " targetSelection='SNAPSHOT'\n", 328 | ")" 329 | ] 330 | } 331 | ], 332 | "metadata": { 333 | "instance_type": "ml.t3.medium", 334 | "interpreter": { 335 | "hash": "dca0ade3e726a953b501b15e8e990130d2b7799f14cfd9f4271676035ebe5511" 336 | }, 337 | "kernelspec": { 338 | "display_name": "Python 3.7.6 64-bit ('base': conda)", 339 | "name": "python3" 340 | }, 341 | "language_info": { 342 | "name": "python", 343 | "version": "" 344 | } 345 | }, 346 | "nbformat": 4, 347 | "nbformat_minor": 4 348 | } -------------------------------------------------------------------------------- /src/cloud/image_classification_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Defect Detection: Image Classification - Pipeline Execution\n", 7 | "\n", 8 | "In this notebook, we will use the pipeline configured in the included python package under `pipelines` together with the defined code for preprocessing and training to automate the model training. It is easy to use such that you can simply drop in whatever input data for image classification you want and have it train a model automatically.\n", 9 | "\n", 10 | "### Expected data format\n", 11 | "\n", 12 | "The expected data format for image classification is .png or .jpg images sorted into a \"normal\" or \"anomalous\" prefix in S3. Thus, the `InputData` parameter of the pipeline needs to point to an S3 prefix which contains \"folders\" (S3 prefixes) named \"normal\" and \"anomalous\". These paths will be used by the preprocessing script to create a RecordIO training data set." 13 | ], 14 | "metadata": {} 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "source": [ 20 | "import boto3\n", 21 | "import sagemaker\n", 22 | "import time\n", 23 | "import uuid\n", 24 | "import json\n", 25 | "\n", 26 | "iot_client = boto3.client('iot')\n", 27 | "sts_client = boto3.client('sts')\n", 28 | "sm_client = boto3.client('sagemaker')\n", 29 | "\n", 30 | "# Get the account id\n", 31 | "account_id = sts_client.get_caller_identity()[\"Account\"]\n", 32 | "\n", 33 | "# Project Name as defined in your CloudFormation template\n", 34 | "PROJECT_NAME = ''\n", 35 | "\n", 36 | "region = boto3.Session().region_name\n", 37 | "role = sagemaker.get_execution_role()\n", 38 | "bucket_name = 'sm-edge-workshop-%s-%s' % (PROJECT_NAME, account_id)\n", 39 | "\n", 40 | "# Change these to reflect your project/business name or if you want to separate ModelPackageGroup/Pipeline from the rest of your team\n", 41 | "model_package_group_name = 'defect-detection-img-classification-%s' % PROJECT_NAME\n", 42 | "job_prefix = 'defect-detection-img-classification'\n", 43 | "pipeline_name = 'defect-detection-img-clf-pipeline-%s' % PROJECT_NAME" 44 | ], 45 | "outputs": [], 46 | "metadata": {} 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "source": [ 51 | "### Getting the pipeline definition\n", 52 | "\n", 53 | "We use the `get_pipeline` method to create a pipeline DAG definition with our provided input. 
The input provided here is fixed for each pipeline you create or update; you cannot change these parameters with each execution (see usage of parameters in the cell below)." 54 | ], 55 | "metadata": {} 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "source": [ 61 | "from pipelines.image_classification.pipeline import get_pipeline\n", 62 | "\n", 63 | "pipeline = get_pipeline(\n", 64 | " region=region,\n", 65 | " role=role,\n", 66 | " default_bucket=bucket_name,\n", 67 | " pipeline_name=pipeline_name,\n", 68 | " base_job_prefix=job_prefix\n", 69 | ")" 70 | ], 71 | "outputs": [], 72 | "metadata": {} 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "source": [ 77 | "### Creating the pipeline\n", 78 | "\n", 79 | "We create the pipeline (or update it in case it exists) with the previously defined DAG definition." 80 | ], 81 | "metadata": {} 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "source": [ 87 | "pipeline.upsert(role_arn=role)" 88 | ], 89 | "outputs": [], 90 | "metadata": {} 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "source": [ 95 | "### Starting the pipeline execution\n", 96 | "\n", 97 | "We now start the execution of the pipeline with a given set of parameters which we can alter for every execution." 98 | ], 99 | "metadata": {} 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "source": [ 105 | "input_data_path = 's3://%s/' % bucket_name\n", 106 | "\n", 107 | "execution = pipeline.start(\n", 108 | " parameters=dict(\n", 109 | " InputData=input_data_path,\n", 110 | " TrainingInstanceType=\"ml.p3.2xlarge\",\n", 111 | " ModelApprovalStatus=\"Approved\",\n", 112 | " ModelPackageGroupName=model_package_group_name,\n", 113 | " TargetImageSize=\"224\",\n", 114 | " AugmentCountAnomalous=\"1000\"\n", 115 | " )\n", 116 | ")" 117 | ], 118 | "outputs": [], 119 | "metadata": {} 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "source": [ 124 | "### Check progress\n", 125 | "\n", 126 | "After the execution has started, you can always check the progress of your pipeline execution either by looking at the processing and training jobs in the SageMaker Console, using the built-in SageMaker Studio Pipeline visualization tools or using SDK methods like below." 127 | ], 128 | "metadata": {} 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "source": [ 134 | "execution.describe()" 135 | ], 136 | "outputs": [], 137 | "metadata": {} 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "source": [ 142 | "## Preparing trained model for edge\n", 143 | "\n", 144 | "Please proceed here only if the execution of the training pipeline was successful. In this part of the workshop, we will prepare the model which you just trained in the pipeline for the deployment onto the edge device."
145 | ], 146 | "metadata": {} 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "source": [ 152 | "compilation_output_sub_folder = 'models/' + job_prefix + '/compilation-output'\n", 153 | "edgepackaging_output_sub_folder = 'models/' + job_prefix + '/edge-packaging-output'\n", 154 | "\n", 155 | "# S3 Location to save the model artifact after compilation\n", 156 | "s3_compilation_output_location = 's3://{}/{}'.format(bucket_name, compilation_output_sub_folder)\n", 157 | "\n", 158 | "# S3 Location to save the model artifact after edge packaging\n", 159 | "s3_edgepackaging_output_location = 's3://{}/{}'.format(bucket_name, edgepackaging_output_sub_folder)" 160 | ], 161 | "outputs": [], 162 | "metadata": {} 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "source": [ 168 | "# Define some helper functions\n", 169 | "\n", 170 | "def get_latest_approved_s3_model_location(client, model_package_group):\n", 171 | " \"\"\"Returns the model location of the latest approved model version in a group\"\"\"\n", 172 | " response = client.list_model_packages(\n", 173 | " ModelPackageGroupName=model_package_group,\n", 174 | " ModelApprovalStatus='Approved'\n", 175 | " )\n", 176 | " latest_version = max(response['ModelPackageSummaryList'], key=lambda x:x['ModelPackageVersion'])\n", 177 | " model_artifact_location = client.describe_model_package(ModelPackageName=latest_version['ModelPackageArn'])['InferenceSpecification']['Containers'][0]['ModelDataUrl']\n", 178 | " return model_artifact_location\n", 179 | "\n", 180 | "def get_latest_approved_model_version(client, model_package_group):\n", 181 | " \"\"\"Returns the model version of the latest approved model version in a group\"\"\"\n", 182 | " response = client.list_model_packages(\n", 183 | " ModelPackageGroupName=model_package_group,\n", 184 | " ModelApprovalStatus='Approved'\n", 185 | " )\n", 186 | " latest_version = max(response['ModelPackageSummaryList'], key=lambda x:x['ModelPackageVersion'])\n", 187 | " return latest_version['ModelPackageVersion']" 188 | ], 189 | "outputs": [], 190 | "metadata": {} 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "source": [ 195 | "### Run SageMaker Neo compilation job" 196 | ], 197 | "metadata": {} 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "source": [ 203 | "# Retrieve some information on the model we just trained and registered in SageMaker Model Registry\n", 204 | "s3_model_artifact_location = get_latest_approved_s3_model_location(sm_client, model_package_group_name)\n", 205 | "print(s3_model_artifact_location)\n", 206 | "\n", 207 | "model_name = 'img-classification'\n", 208 | "compilation_job_name = '%s-%d' % (model_name, int(time.time()*1000))\n", 209 | "\n", 210 | "# Let's start a compilation job for the target architecture\n", 211 | "sm_client.create_compilation_job(\n", 212 | " CompilationJobName=compilation_job_name,\n", 213 | " RoleArn=role,\n", 214 | " InputConfig={\n", 215 | " 'S3Uri': s3_model_artifact_location,\n", 216 | " 'DataInputConfig': '{\"data\": [1,3,224,224]}',\n", 217 | " 'Framework': 'MXNET'\n", 218 | " },\n", 219 | " OutputConfig={\n", 220 | " 'S3OutputLocation': s3_compilation_output_location,\n", 221 | " 'TargetPlatform': {'Os': 'LINUX', 'Arch': 'X86_64'}\n", 222 | " },\n", 223 | " StoppingCondition={ 'MaxRuntimeInSeconds': 900 }\n", 224 | ")\n", 225 | "\n", 226 | "# Poll the status of the job\n", 227 | "print('Started compilation job .', end='')\n", 228 | "while True:\n", 229 
| " resp = sm_client.describe_compilation_job(CompilationJobName=compilation_job_name)\n", 230 | " if resp['CompilationJobStatus'] in ['STARTING', 'INPROGRESS']:\n", 231 | " print('.', end='')\n", 232 | " else:\n", 233 | " print(resp['CompilationJobStatus'], compilation_job_name)\n", 234 | " break\n", 235 | " time.sleep(5)\n", 236 | " \n", 237 | "if resp['CompilationJobStatus'] == 'COMPLETED':\n", 238 | " s3_compiled_model_artifact_location_fullpath = resp['ModelArtifacts']['S3ModelArtifacts']\n", 239 | " print(f'Compiled artifact location in S3: {s3_compiled_model_artifact_location_fullpath}')" 240 | ], 241 | "outputs": [], 242 | "metadata": {} 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "source": [ 247 | "### Running the SageMaker Edge Packaging job" 248 | ], 249 | "metadata": {} 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "source": [ 255 | "# Run the edge packaging job\n", 256 | "edge_packaging_job_name='%s-%d' % (model_name, int(time.time()*1000))\n", 257 | "model_version=str(get_latest_approved_model_version(sm_client, model_package_group_name))\n", 258 | "\n", 259 | "# Start the edge packaging job\n", 260 | "resp = sm_client.create_edge_packaging_job(\n", 261 | " EdgePackagingJobName=edge_packaging_job_name,\n", 262 | " CompilationJobName=compilation_job_name,\n", 263 | " ModelName=model_name,\n", 264 | " ModelVersion=model_version,\n", 265 | " RoleArn=role,\n", 266 | " OutputConfig={\n", 267 | " 'S3OutputLocation': s3_edgepackaging_output_location\n", 268 | " }\n", 269 | ")\n", 270 | "\n", 271 | "# Poll the status of the job\n", 272 | "print('Started edge packaging job .', end='')\n", 273 | "while True:\n", 274 | " resp = sm_client.describe_edge_packaging_job(EdgePackagingJobName=edge_packaging_job_name)\n", 275 | " if resp['EdgePackagingJobStatus'] in ['STARTING', 'INPROGRESS']:\n", 276 | " print('.', end='')\n", 277 | " else:\n", 278 | " print(resp['EdgePackagingJobStatus'], edge_packaging_job_name)\n", 279 | " break\n", 280 | " time.sleep(5)\n", 281 | " \n", 282 | "if resp['EdgePackagingJobStatus'] == 'COMPLETED':\n", 283 | " s3_packaged_model_artifact_location_fullpath = resp['ModelArtifact']\n", 284 | " print(f'Packaged artifact location in S3: {s3_packaged_model_artifact_location_fullpath}')" 285 | ], 286 | "outputs": [], 287 | "metadata": {} 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "source": [ 292 | "### Running IoT Job for deployment onto the edge" 293 | ], 294 | "metadata": {} 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "source": [ 300 | "def split_s3_path(s3_path):\n", 301 | " path_parts=s3_path.replace(\"s3://\",\"\").split(\"/\")\n", 302 | " bucket=path_parts.pop(0)\n", 303 | " key=\"/\".join(path_parts)\n", 304 | " return bucket, key\n", 305 | "\n", 306 | "model_bucket, model_key = split_s3_path(s3_packaged_model_artifact_location_fullpath)" 307 | ], 308 | "outputs": [], 309 | "metadata": {} 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "source": [ 315 | "resp = iot_client.create_job(\n", 316 | " jobId=str(uuid.uuid4()),\n", 317 | " targets=[\n", 318 | " 'arn:aws:iot:%s:%s:thinggroup/defect-detection-%s-group' % (region, account_id, PROJECT_NAME), \n", 319 | " ],\n", 320 | " document=json.dumps({\n", 321 | " 'type': 'new_model',\n", 322 | " 'model_version': model_version,\n", 323 | " 'model_name': model_name,\n", 324 | " 'model_package_bucket': model_bucket,\n", 325 | " 'model_package_key': model_key\n", 326 | " }),\n", 327 | " 
targetSelection='SNAPSHOT'\n", 328 | ")" 329 | ], 330 | "outputs": [], 331 | "metadata": {} 332 | } 333 | ], 334 | "metadata": { 335 | "instance_type": "ml.t3.medium", 336 | "interpreter": { 337 | "hash": "dca0ade3e726a953b501b15e8e990130d2b7799f14cfd9f4271676035ebe5511" 338 | }, 339 | "kernelspec": { 340 | "display_name": "Python 3.7.6 64-bit ('base': conda)", 341 | "name": "python3" 342 | }, 343 | "language_info": { 344 | "name": "python", 345 | "version": "" 346 | } 347 | }, 348 | "nbformat": 4, 349 | "nbformat_minor": 4 350 | } -------------------------------------------------------------------------------- /setup/lambda-custom-resource/prepare_dev_package_cr.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | """ 4 | Lambda-backed custom resource function to create the SageMaker Edge Manager device package. 5 | Support SageMaker Edge Agent Version: 6 | """ 7 | import json 8 | import os 9 | import logging 10 | import stat 11 | from botocore.parsers import LOG 12 | import urllib3 13 | import boto3 14 | import tarfile 15 | import io 16 | from botocore.exceptions import ClientError 17 | 18 | http = urllib3.PoolManager() 19 | 20 | LOGGER = logging.getLogger() 21 | LOGGER.setLevel(logging.INFO) 22 | 23 | BUCKET_NAME = os.environ['BUCKET_NAME'] 24 | PROJECT_NAME = os.environ['PROJECT_NAME'] 25 | AWS_REGION = os.environ['AWS_REGION'] 26 | 27 | LOCAL_DIR_PREFIX = '/tmp/' # Needed for running in AWS Lambda 28 | 29 | iot_client = boto3.client('iot') 30 | sm_client = boto3.client('sagemaker') 31 | s3_client = boto3.client('s3') 32 | 33 | # Global variables 34 | # This information needs to match with what was defined in the CloudFormation template 35 | sm_edge_device_name = 'edge-device-defect-detection-%s' % PROJECT_NAME 36 | iot_policy_name = 'defect-detection-policy-%s' % PROJECT_NAME 37 | iot_thing_name = 'edge-device-%s' % PROJECT_NAME 38 | iot_thing_group_name='defect-detection-%s-group' % PROJECT_NAME 39 | sm_em_fleet_name = 'defect-detection-%s' % PROJECT_NAME 40 | role_alias = 'SageMakerEdge-%s' % sm_em_fleet_name 41 | 42 | 43 | def cfn_cleanup(): 44 | """Clean up resources created in the custom resources""" 45 | 46 | LOGGER.info('Deleting role alias if exists') 47 | try: 48 | iot_client.delete_role_alias(roleAlias=role_alias) 49 | except: 50 | LOGGER.info('Role alias deletion failed, continuing anyways') 51 | 52 | LOGGER.info('Deregistering device from edge fleet if exists') 53 | try: 54 | sm_client.deregister_devices( 55 | DeviceFleetName=sm_em_fleet_name, 56 | DeviceNames=[sm_edge_device_name] 57 | ) 58 | except: 59 | LOGGER.info('Device deregistration failed, continuing anyways') 60 | 61 | LOGGER.info('Detaching certificates') 62 | try: 63 | cert_arn = iot_client.list_thing_principals(thingName=iot_thing_name)['principals'][0] 64 | cert_id = cert_arn.split('/')[-1] 65 | iot_client.detach_policy(policyName=iot_policy_name, target=cert_arn) 66 | iot_client.detach_thing_principal(thingName=iot_thing_name, principal=cert_arn) 67 | iot_client.update_certificate(certificateId=cert_id, newStatus='INACTIVE') 68 | iot_client.delete_certificate(certificateId=cert_id, forceDelete=True) 69 | iot_client.delete_thing_group(thingGroupName=iot_thing_group_name) 70 | except: 71 | LOGGER.info('Detaching certificates failed, continuing anyways') 72 | 73 | 74 | 75 | 76 | def lambda_handler(event, context): 77 | '''Handle Lambda event from AWS''' 78 | 79 | try: 
80 | LOGGER.info('REQUEST RECEIVED:\n %s', event) 81 | LOGGER.info('REQUEST RECEIVED:\n %s', context) 82 | if event['RequestType'] == 'Create': 83 | LOGGER.info('CREATE!') 84 | 85 | LOGGER.info('Starting device packaging...') 86 | try: 87 | prepare_device_package(event, context) 88 | send_response(event, context, "SUCCESS", 89 | {"Message": "Resource creation successful!"}) 90 | except Exception as e: 91 | send_response(event, context, "FAILED", {"Message": "Resource creation failed during device packaging!", "Error": str(e)}) 92 | elif event['RequestType'] == 'Update': 93 | LOGGER.info('UPDATE!') 94 | send_response(event, context, "SUCCESS", 95 | {"Message": "Resource update successful!"}) 96 | elif event['RequestType'] == 'Delete': 97 | LOGGER.info('DELETE!') 98 | # Start cleanup method 99 | cfn_cleanup() 100 | send_response(event, context, "SUCCESS", 101 | {"Message": "Resource deletion successful!"}) 102 | else: 103 | LOGGER.info('FAILED!') 104 | send_response(event, context, "FAILED", 105 | {"Message": "Unexpected event received from CloudFormation"}) 106 | except: #pylint: disable=W0702 107 | LOGGER.info('FAILED!') 108 | send_response(event, context, "FAILED", { 109 | "Message": "Exception during processing"}) 110 | 111 | 112 | def send_response(event, context, response_status, response_data): 113 | '''Send a resource manipulation status response to CloudFormation''' 114 | response_body = json.dumps({ 115 | "Status": response_status, 116 | "Reason": "See the details in CloudWatch Log Stream: " + context.log_stream_name, 117 | "PhysicalResourceId": context.log_stream_name, 118 | "StackId": event['StackId'], 119 | "RequestId": event['RequestId'], 120 | "LogicalResourceId": event['LogicalResourceId'], 121 | "Data": response_data 122 | }) 123 | 124 | print("Response body:") 125 | print(response_body) 126 | 127 | response_url = event['ResponseURL'] 128 | 129 | headers = { 130 | 'content-type' : '', 131 | 'content-length' : str(len(response_body)) 132 | } 133 | 134 | try: 135 | response = http.request('PUT', response_url, headers=headers, body=response_body) 136 | print("Status code:", response.status) 137 | 138 | except Exception as e: 139 | 140 | print("send(..) failed executing http.request(..):", e) 141 | 142 | 143 | def setup_agent(thing_group_name, thing_group_arn): 144 | """Creates configuration file and sets up SageMaker Edge Agent for deployment 145 | onto a Amazon S3 bucket. Registers a device with a device fleet, creates IoT 146 | certificates and attaches them to the previously created IoT thing. Saves 147 | certificates onto local disk to make it ready for uploading to S3. 148 | 149 | Args: 150 | thing_group_name (string): a name for the IoT thing group 151 | thing_group_arn (string): the ARN of the IoT thing group 152 | """ 153 | 154 | local_base_path = LOCAL_DIR_PREFIX + "agent/certificates/iot/edge_device_cert_%s.pem" 155 | relative_base_path = "agent/certificates/iot/edge_device_cert_%s.pem" 156 | thing_arn_template = thing_group_arn.replace('thinggroup', 'thing').replace(thing_group_name, '%s') 157 | cred_host = iot_client.describe_endpoint(endpointType='iot:CredentialProvider')['endpointAddress'] 158 | 159 | # Check length of device name string 160 | if len(sm_edge_device_name) > 64: 161 | LOGGER.error("Device name for edge device is too long. Needs to be <64 characters.") 162 | raise ClientError('Device name for edge device is longer than 64 characters. 
Please choose a shorter value for ProjectName.') 163 | 164 | # register the device in the fleet 165 | # the device name needs to have 36 chars 166 | dev = [{'DeviceName': sm_edge_device_name, 'IotThingName': iot_thing_name}] 167 | try: 168 | sm_client.describe_device(DeviceFleetName=sm_em_fleet_name, DeviceName=sm_edge_device_name) 169 | LOGGER.info("Device was already registered on SageMaker Edge Manager") 170 | except ClientError as e: 171 | if e.response['Error']['Code'] != 'ValidationException': raise e 172 | LOGGER.info("Registering a new device %s on fleet %s" % (sm_edge_device_name, sm_em_fleet_name)) 173 | sm_client.register_devices(DeviceFleetName=sm_em_fleet_name, Devices=dev) 174 | iot_client.add_thing_to_thing_group( 175 | thingGroupName=thing_group_name, 176 | thingGroupArn=thing_group_arn, 177 | thingName=iot_thing_name, 178 | thingArn=thing_arn_template % iot_thing_name 179 | ) 180 | 181 | # if you reach this point you need to create new certificates 182 | # generate the certificates 183 | cert = local_base_path % ('cert') 184 | key = local_base_path % ('pub') 185 | pub = local_base_path % ('key') 186 | 187 | # Relative paths needed for setting path in config file 188 | cert_relative = relative_base_path % ('cert') 189 | key_relative = relative_base_path % ('pub') 190 | pub_relative = relative_base_path % ('key') 191 | 192 | cert_meta=iot_client.create_keys_and_certificate(setAsActive=True) 193 | cert_arn = cert_meta['certificateArn'] 194 | with open(cert, 'w') as c: c.write(cert_meta['certificatePem']) 195 | with open(key, 'w') as c: c.write(cert_meta['keyPair']['PrivateKey']) 196 | with open(pub, 'w') as c: c.write(cert_meta['keyPair']['PublicKey']) 197 | 198 | # attach the certificates to the policy and to the thing 199 | iot_client.attach_policy(policyName=iot_policy_name, target=cert_arn) 200 | iot_client.attach_thing_principal(thingName=iot_thing_name, principal=cert_arn) 201 | 202 | LOGGER.info("Creating agent config JSON file") 203 | 204 | # Please note that the $WORKDIR variables need to be replaced by the absolute path of the working directory of your project. 205 | # If you follow the guide, the install script will automatically replace those. 
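# Note (added for clarity): the agent_params dict below is written to agent/conf/config_edge_device.json,
# bundled into the config.tgz package by prepare_device_package() and uploaded to the project S3 bucket,
# from where the install script on the edge device downloads it (see the README walkthrough).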
206 | agent_params = { 207 | "sagemaker_edge_core_device_name": sm_edge_device_name, 208 | "sagemaker_edge_core_device_fleet_name": sm_em_fleet_name, 209 | "sagemaker_edge_core_region": AWS_REGION, 210 | "sagemaker_edge_provider_provider": "Aws", 211 | "sagemaker_edge_provider_provider_path" : "$WORKDIR/agent/lib/libprovider_aws.so", 212 | "sagemaker_edge_core_root_certs_path": "$WORKDIR/agent/certificates/root", 213 | "sagemaker_edge_provider_aws_ca_cert_file": "$WORKDIR/agent/certificates/iot/AmazonRootCA1.pem", 214 | "sagemaker_edge_provider_aws_cert_file": "$WORKDIR/%s" % cert_relative, 215 | "sagemaker_edge_provider_aws_cert_pk_file": "$WORKDIR/%s" % key_relative, 216 | "sagemaker_edge_provider_aws_iot_cred_endpoint": "https://%s/role-aliases/%s/credentials" % (cred_host,role_alias), 217 | "sagemaker_edge_core_capture_data_destination": "Cloud", 218 | "sagemaker_edge_provider_s3_bucket_name": BUCKET_NAME, 219 | "sagemaker_edge_core_folder_prefix": "edge-agent-inference-data-capture", 220 | "sagemaker_edge_core_capture_data_buffer_size": 30, 221 | "sagemaker_edge_core_capture_data_batch_size": 10, 222 | "sagemaker_edge_core_capture_data_push_period_seconds": 10, 223 | "sagemaker_edge_core_capture_data_base64_embed_limit": 2, 224 | "sagemaker_edge_log_verbose": False 225 | } 226 | with open(LOCAL_DIR_PREFIX + 'agent/conf/config_edge_device.json', 'w') as conf: 227 | conf.write(json.dumps(agent_params, indent=4)) 228 | 229 | 230 | def prepare_device_package(event, context): 231 | """Prepares the edge device package in a lambda function and uploads it to the S3 bucket""" 232 | 233 | # create a new thing group 234 | thing_group_arn = None 235 | agent_pkg_bucket = 'sagemaker-edge-release-store-us-west-2-linux-x64' 236 | agent_config_package_prefix = 'edge-device-configuration/agent/config.tgz' 237 | 238 | # check if edge agent package has already been built 239 | try: 240 | s3_client.download_file(Bucket=BUCKET_NAME, Key=agent_config_package_prefix, Filename='/tmp/dump') 241 | LOGGER.info('The agent configuration package was already built! 
Skipping...') 242 | quit() 243 | except ClientError as e: 244 | pass 245 | 246 | # Create a new thing group if not found yet 247 | try: 248 | thing_group_arn = iot_client.describe_thing_group(thingGroupName=iot_thing_group_name)['thingGroupArn'] 249 | LOGGER.info("Thing group found") 250 | except iot_client.exceptions.ResourceNotFoundException as e: 251 | LOGGER.info("Creating a new thing group") 252 | thing_group_arn = iot_client.create_thing_group(thingGroupName=iot_thing_group_name)['thingGroupArn'] 253 | 254 | LOGGER.info("Creating the directory structure for the agent") 255 | # create a structure for the agent files 256 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/certificates/root', exist_ok=True) 257 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/certificates/iot', exist_ok=True) 258 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/logs', exist_ok=True) 259 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/model', exist_ok=True) 260 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/conf', exist_ok=True) 261 | 262 | LOGGER.info("Downloading root certificate and agent binary") 263 | # then get some root certificates 264 | resp = http.request('GET', 'https://www.amazontrust.com/repository/AmazonRootCA1.pem') 265 | with open(LOCAL_DIR_PREFIX + 'agent/certificates/iot/AmazonRootCA1.pem', 'w') as c: 266 | c.write(resp.data.decode('utf-8')) 267 | 268 | # this certificate validates the edge manage package 269 | s3_client.download_file( 270 | Bucket=agent_pkg_bucket, 271 | Key='Certificates/%s/%s.pem' % (AWS_REGION, AWS_REGION), 272 | Filename=LOCAL_DIR_PREFIX + 'agent/certificates/root/%s.pem' % AWS_REGION 273 | ) 274 | 275 | LOGGER.info("Adjusting file permissions of pem files") 276 | # adjust the permissions of the files 277 | os.chmod(LOCAL_DIR_PREFIX + 'agent/certificates/iot/AmazonRootCA1.pem', stat.S_IRUSR|stat.S_IRGRP) 278 | os.chmod(LOCAL_DIR_PREFIX + 'agent/certificates/root/%s.pem' % AWS_REGION, stat.S_IRUSR|stat.S_IRGRP) 279 | 280 | LOGGER.info("Processing the agent...") 281 | setup_agent(iot_thing_group_name, thing_group_arn ) 282 | 283 | LOGGER.info("Creating the final package...") 284 | with io.BytesIO() as f: 285 | with tarfile.open(fileobj=f, mode='w:gz') as tar: 286 | tar.add(LOCAL_DIR_PREFIX + 'agent', 'agent', recursive=True) 287 | f.seek(0) 288 | LOGGER.info("Uploading to S3") 289 | s3_client.upload_fileobj(f, Bucket=BUCKET_NAME, Key=agent_config_package_prefix) 290 | LOGGER.info("Done!") -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Defect detection using computer vision at the edge with Amazon SageMaker 2 | 3 | This repository is related to our blog post [Detect industrial defects at low latency with computer vision at the edge with Amazon SageMaker Edge](https://aws.amazon.com/blogs/machine-learning/detect-industrial-defects-at-low-latency-with-computer-vision-at-the-edge-with-amazon-sagemaker-edge/) in the AWS Machine Learning blog. 4 | 5 | In this workshop, we will walk you through a step by step process to build and train computer vision models with Amazon SageMaker and package and deploy them to the edge with [SageMaker Edge Manager](https://aws.amazon.com/sagemaker/edge-manager/). The workshop focuses on a defect detection use case in an industrial setting with models like image classification, and semantic segmentation to detect defects across several object types. 
We will complete the MLOps lifecycle with continuous versioned over-the-air model updates and data capture to the cloud. 6 | 7 | > [!WARNING] 8 | > Please note that this sample is outdated. Since 26th of April 2024, SageMaker Edge Manager has been discontinued. Please refer to the respective [EOL documentation page](https://docs.aws.amazon.com/sagemaker/latest/dg/edge-eol.html) in order to learn about potential alternatives. 9 | 10 | ## Architecture 11 | 12 | The architecture we will build during this workshop is illustrated below. Several key components can be highlighted: 13 | 14 | 1. **Model development and training on the cloud**: This repository contains code for two pipelines based on [SageMaker Pipelines](https://docs.aws.amazon.com/sagemaker/latest/dg/pipelines.html) for each of the two model types used (classification and segmentation). These pipelines will be built and executed in a SageMaker Studio notebook. 15 | 2. **Model deployment to the edge**: Once a model building pipeline executed successfully, models will be compiled with [SageMaker Neo](https://aws.amazon.com/sagemaker/neo/) and packaged with a [SageMaker Edge packaging job](https://docs.aws.amazon.com/sagemaker/latest/dg/edge-packaging-job.html). As such, they can be deployed onto the edge device via IoT Jobs. On the edge device an application is running which will receive the model deployment job payload via MQTT and download the relevant model package. 16 | 3. **Edge inference**: The edge device is running the actual application for defect detection. In this workshop, we will use an EC2 instance to simulate an edge device - but any hardware device (RaspberryPi, Nvidia Jetson) can be used as long as SageMaker Neo compilations are supported. During setup, a configuration package is being downloaded to edge device to configure SageMaker Edge Agent. The Edge Agent on the device can then load models deployed via OTA updates and make them available for prediction via a low-latency gRPC API (see [SageMaker Edge Manager documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/edge.html)). 17 | 18 | ![architecture](img/architecture.png) 19 | 20 | ## Dataset 21 | 22 | This workshop is designed to be used with any dataset for defect detection that includes labels and masks. To be able to use both models (see section [Models](#models)), you will need a dataset of labelled images (*normal* and *anomalous*) as well as a set of respective *ground truth masks* which identify where the defect on a part is located. To train the models with the provided pipeline without any major code adjustments, you merely need to upload the dataset in the format together with correct path prefixes in an S3 bucket. Please refer to the [Getting Started](#getting-started) guide below on more details for model training with a dataset. 23 | 24 | However, for simplicity of this walkthrough, we will showcase the end-to-end solution using the [KolektorSDD2](https://www.vicos.si/resources/kolektorsdd2/) dataset for defect detection. This dataset consists of over 3000 images of surface defects together with respective binary masks which identify the location of those defects in the image. This makes this dataset very much suitable for our use case. 25 | 26 | Below you can find examples of those images and their masks as provided in the dataset. The image was taken from the [website](https://www.vicos.si/resources/kolektorsdd2/) of the creators of the KolektorSDD2 dataset (see also [Bozic et al., 2021] under [References](#references) ). 
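Whichever dataset you end up using, getting it into the workshop only requires uploading it to the S3 prefixes the training pipelines expect (described in detail in the Walkthrough below). A minimal sketch using the AWS CLI, assuming you have already split the data into local folders (names here are only illustrative) and that `<bucket-name>` is the workshop bucket created by the CloudFormation stack:

```bash
# Hypothetical local layout after preparing the data (e.g. with data_preparation.ipynb):
#   ./prepared/img-classification/{normal,anomalous}
#   ./prepared/semantic-segmentation/{images,masks}

# Image classification data, sorted by label
aws s3 sync ./prepared/img-classification/normal    s3://<bucket-name>/data/img-classification/normal/
aws s3 sync ./prepared/img-classification/anomalous s3://<bucket-name>/data/img-classification/anomalous/

# Semantic segmentation data: images and their binary ground truth masks
aws s3 sync ./prepared/semantic-segmentation/images s3://<bucket-name>/data/semantic-segmentation/images/
aws s3 sync ./prepared/semantic-segmentation/masks  s3://<bucket-name>/data/semantic-segmentation/masks/
```

The `InputData` parameter you pass when starting a pipeline execution then needs to point at the corresponding prefix, as described in the Walkthrough.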
27 | 28 | ![kolektorimg](img/kolektor_sdd2.png) 29 | 30 | ## Models 31 | 32 | In this workshop, you will build two types of machine learning models: 33 | 34 | * an image classification model using the [built-in SageMaker Image Classification algorithm](https://docs.aws.amazon.com/sagemaker/latest/dg/image-classification.html) based on the [MXNet framework](https://mxnet.apache.org/versions/1.8.0/) 35 | * a semantic segmentation model built with [Tensorflow/Keras](https://github.com/tensorflow/tensorflow) using the [UNET deep learning architecture](https://arxiv.org/abs/1505.04597) 36 | 37 | ## Directory structure of this repository 38 | 39 | This repository has the following directory structure: 40 | 41 | ``` 42 | ├── setup <-- contains the CloudFormation template for easy-to-use setup of AWS resources 43 | └── src <-- contains the actual source code for this project 44 | ├── cloud <-- contains the code for model training in the cloud and initiation of OTA deployments to the edge 45 | └── edge <-- contains the code that is running on the edge device 46 | ``` 47 | 48 | ### Edge code directory structure 49 | 50 | ``` 51 | src/edge 52 | ├── app <-- python module for this application 53 | │   ├── edgeagentclient.py <-- abstractions for calling edge agent gRPC APIs 54 | │   ├── logger.py <-- utilities for logging output to AWS IoT Core 55 | │   ├── ota.py <-- utilities for handling OTA IoT jobs 56 | │   └── util.py <-- additional utilities 57 | ├── install.py <-- install script for downloading and configuring edge agent 58 | ├── models_config.json <-- model configuration, also used for persisting model versions 59 | ├── run.py <-- runs the edge application 60 | ├── start_edge_agent.sh <-- starts the SM edge agent 61 | ├── static <-- contains static images for Flask app, download test images here 62 | └── templates <-- contains HTML Jinja templates for Flask app 63 | ``` 64 | 65 | ### Cloud code directory structure 66 | 67 | ``` 68 | src/cloud 69 | ├── image_classification_pipeline.ipynb <-- notebook for running the image classification pipeline 70 | ├── semantic_segmentation_pipeline.ipynb <-- notebook for running the semantic segmentation pipeline 71 | ├── data_preparation.ipynb <-- notebook for data preprocessing of the KolektorSDD2 dataset 72 | └── pipelines <-- model building code and pipeline definition 73 |    ├── get_pipeline_definition.py <-- CLI tool for CICD 74 |   ├── run_pipeline.py <-- CLI tool for CICD 75 |    ├── image_classification <-- contains the pipeline code for image classification 76 |    │   ├── evaluation.py <-- script to evaluate model performance on test dataset 77 |    │   ├── pipeline.py <-- pipeline definition 78 |    │   └── preprocessing.py <-- script for preprocessing (augmentation, train/test/val split) 79 |    └── semantic_segmentation <-- contains the pipeline code for semantic segmentation 80 |    ├── pipeline.py <-- pipeline definition 81 |    ├── preprocessing.py <-- script for preprocessing (augmentation, train/test/val split) 82 |    ├── requirements.txt <-- python dependencies needed for training 83 |    └── train_tf.py <-- training script for training the unet model 84 | 85 | ``` 86 | 87 | ## Walkthrough 88 | 89 | Please follow the steps below to start building your own edge ML project. You will create a CloudFormation stack to set up all necessary resources in the cloud and prepare an edge device for usage with SageMaker Edge Manager. You will then train models in the cloud and deployment to the edge device using AWS IoT. 
Please note that model training in the cloud and running inference on the edge are independent of each other. We recommend you start by setting up the edge device first and then train the models as a second step. This way, you can then directly deploy them to the edge after you have successfully trained the models. 90 | 91 | ### Setting up workshop resources by launching the CloudFormation stack 92 | 93 | 1. Launch a new CloudFormation stack with the provided template under `setup/template.yaml`. To learn about how to deploy CloudFormation stacks, please refer to the [documentation](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/cfn-console-create-stack.html). 94 | 2. Define a name for the stack and enter a *Project Name* parameter that is unique in your account. It must be compliant with Amazon S3 bucket names, so please choose a lowercase string here. The project name that you define during stack creation defines the name of many of the resources that are being created with the stack. Make sure to take note of this parameter. 95 | 3. Have a look at the CloudFormation stack outputs and take note of the provided information. 96 | 97 | #### What is being created by the CloudFormation stack? 98 | 99 | This stack configures several resources needed for this workshop. It sets up an IoT device together with certificates and roles, an Edge Manager fleet, registers the device with the fleet and creates a package for edge agent configuration which is saved in the S3 bucket for this project. The following image illustrates the resources being created with the CloudFormation stack. 100 | 101 | ![edge config](img/cloudformation.png) 102 | 103 | ### Configuring the edge device 104 | 105 | 1. Launch an EC2 instance with Ubuntu Server 20 with SSH access (e.g. via [Session Manager](https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager.html)) into a public subnet and make sure it gets assigned a public IP (you will need this later to access the web application). Ensure that it has access to the S3 buckets containing your configuration package (find the bucket name in the CloudFormation output). It will also need access to the bucket containing the SageMaker Edge Agent binary. For more information, refer to the [SageMaker Edge Manager documentation pages](https://docs.aws.amazon.com/sagemaker/latest/dg/edge-device-fleet-about.html). This EC2 instance will from now on be considered our "edge device". 106 | 2. Clone this GitHub repository onto the edge device or simply copy the `src/edge` directory onto the edge device. 107 | 3. Install the dependencies by running `sudo apt update -y && sudo apt install -y build-essential procps` and `pip install -r requirements.txt` to install the necessary Python dependencies. 108 | 4. Run the installation script with `python3 install.py --project-name <project-name> --account-id <account-id>`. This script will download the edge agent configuration package created during the CloudFormation deployment, download the edge agent binary and also generate the protobuf agent stubs. A newly created directory `./agent/` contains the files for the edge agent. The following image illustrates what happens in the installation script: 109 | 110 | ![edge config](img/edge_config.png) 111 | 112 | 5. Create an environment variable to define the location of the agent directory. If you haven't changed your current directory, this would likely be `export SM_EDGE_AGENT_HOME=$PWD/agent`. 113 | 6. 
Start the edge agent by running `./start_edge_agent.sh`, which launches the edge agent on the unix socket `tmp/edge_agent`. You should now be able to interact with the edge agent from your application. 114 | 7. Before running the actual application, you need to define an environment variable which determines whether you want to run the app with the Flask development server or with a production-ready WSGI server (using [waitress](https://github.com/Pylons/waitress)). For now, let's use the production server by setting `export SM_APP_ENV=prod`. For debugging, you might want to change this to `dev` later. 115 | 8. Run the application with `python3 run.py` to initialize the application, verify cloud connectivity and connect to the edge agent. This application is a [Flask](https://flask.palletsprojects.com/en/2.0.x/) web application running on port 8080 which is integrated with SageMaker Edge Agent and AWS IoT for OTA updates. You will see that, if you have no models deployed yet and have not downloaded any test images, nothing will happen yet in the application. It will stay idle until it can access test images in the `/static` folder and run inference on those with a deployed model. In the next step, we will see how we can run automated model training with SageMaker Pipelines and deploy the resulting models onto the edge device for local inference. 116 | 9. Go to the EC2 dashboard and find the public IP address of your instance. Browse the public IP address on port 8080, i.e. `http://<public-ip>:8080`. You should now see the web application in your browser window. Ensure that you allow ingress on port 8080 in the security group attached to your instance (see [here](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/working-with-security-groups.html#adding-security-group-rule) for details on how to set this up). Also, make sure your local firewalls on your device allow ingress through port 8080. Refer to the [Troubleshooting](#troubleshooting-and-faq) section for further tips. 117 | 118 | ### Automated model training in the cloud with SageMaker Pipelines 119 | 120 | 1. Create a SageMaker Studio domain and user by following [this](https://docs.aws.amazon.com/sagemaker/latest/dg/notebooks.html) guide in the documentation. Make sure that the IAM role used has access to the S3 bucket created during the CloudFormation deployment. 121 | 2. Clone this repository or copy the `src/cloud` directory onto the SageMaker Studio domain. 122 | 3. Prepare your dataset for training. In case you choose to use the KolektorSDD2 dataset like in this example, you can use the provided notebook under `src/cloud/data_preparation.ipynb` to download the dataset and partition it into the subdirectories needed for the training pipeline. With the provided pipeline code you can train two model types (image classification and semantic segmentation). You might want to set aside some images to be used for local inference. Download those onto the edge device and save them into the `static` folder so they can be used for inference by the edge application. Please note that we integrated a step in the preprocessing scripts to add padding around the images in order to make them square, as the KolektorSDD2 dataset contains non-square images. If your provided images are already square, this step will be skipped. Just ensure that you use the same preprocessing for inference later (i.e. add padding if padding was added for training). 123 | 4. 
To use the pipelines without any code modifications you need to structure your datasets as follows: 124 | * **Image Classification**: Your dataset needs to be split into `normal` and `anomalous` directories according to their respective label. Upload the data to your S3 bucket (e.g. under `s3://<bucket-name>/data/img-classification/`). Thus, your normal images will be located in `s3://<bucket-name>/data/img-classification/normal` and the anomalous ones in `s3://<bucket-name>/data/img-classification/anomalous`. Train / test / validation split will be done automatically in the preprocessing step of the pipeline. 125 | * **Semantic Segmentation**: Your dataset needs to be split into `images` and `masks` directories. Upload the data to your S3 bucket (e.g. under `s3://<bucket-name>/data/semantic-segmentation/`). Thus, your images will be located in `s3://<bucket-name>/data/semantic-segmentation/images` and the binary masks in `s3://<bucket-name>/data/semantic-segmentation/masks`. Train / test / validation split will be done automatically in the preprocessing step of the pipeline. 126 | 5. Execute the training pipeline: you will find a Jupyter Notebook for each of the model types in `src/cloud/`. Please adjust the project name you used during the CloudFormation deployment in the notebook. Also, you need to provide the S3 input data path as a parameter of the pipeline. Please make sure this aligns with the S3 path you used for uploading the dataset in step 3. You can monitor the pipeline execution in your SageMaker Studio domain. In case it finishes successfully, it should look similar to the one displayed below. 127 | 128 | ![pipeline](img/pipeline.png) 129 | 130 | ### Edge deployment and inference at edge 131 | 132 | 1. Once the pipeline has finished successfully, your model is almost ready for use on the edge device. Verify that the latest model version in the model registry is approved to make it available for edge deployment. 133 | 2. Execute the following cells of the notebook to run model compilation with SageMaker Neo and then package the model for usage with SageMaker Edge Manager. 134 | 3. Finally, you can deploy the model package onto the edge by running the IoT Job as an Over-The-Air update. If your edge application is currently running, it should receive the OTA deployment job, download the model package and load it into the Edge Agent. 135 | 4. Verify that the deployment automation works by checking the log output on the edge device. You can also verify the successful deployment of a new model version by verifying the successful execution of the IoT job in the AWS IoT Core Console (under "Manage" --> "Jobs") as shown below. 136 | 137 | ![pipeline](img/iot_job.png) 138 | 139 | #### Persisting model configuration 140 | 141 | You can set which models should be loaded initially by configuring the `models_config.json` file. The application will instruct the edge agent to load these models upon startup. You can update model versions by creating IoT jobs from the cloud. The OTA IoT client running alongside the application will listen to the job topics and download the model accordingly. Please also note that for each new model you deploy you might have to adjust your application code accordingly (e.g. if your input shape changes). The structure of the `models_config.json` file with a sample configuration is shown below. 142 | 143 | In `"mappings"`, you can define which model should be used for each of the two inferences in the application; this name needs to align with the model name you choose during OTA deployment. 
In `"models"`, information about the models loaded into the edge agent are persisted even after you shutdown the application. Please note that this is automatically filled out by the application and saved before you close out of the application. You do not need to manually configure this. In case you want to use a manually deployed model package with this application, you can instruct the application to load this model by manually adding a model definition into the JSON file under `"models"`. 144 | 145 | ```json 146 | { 147 | "mappings": { 148 | "image-classification-app": "img-classification", 149 | "image-segmentation-app": "unet" 150 | }, 151 | "models": [ 152 | { 153 | "name": "img-classification", 154 | "version": "1", 155 | "identifier": "img-classification-1" 156 | } 157 | ] 158 | } 159 | ``` 160 | 161 | #### Running inference on the edge device 162 | 163 | To run inference on the device, you need to have fulfilled the following requirements: 164 | 165 | * The edge agent on the edge device is properly configured and can successfully authenticate against AWS IoT 166 | * You have downloaded test images onto the edge device in the folder `static/` 167 | * You have deployed at least one of the two models (image classification or semantic segmentation) via OTA updates 168 | * The edge agent is running and the models could be loaded successfully (for troubleshooting check command line output or edge agent logs in `agent/logs/agent.log`) 169 | 170 | If everything is configured accordingly, you should see the edge application cycling through the provided images in the `static/` directory and run inference against both of the models. The result of the inference is then displayed in the web application. You can see a screenshot of the running web application below. The two models loaded into edge agent are displayed on the top, the incoming image from the camera stream is fed into the two models and the predictions are illustrated on the bottom of the page. 171 | 172 | ![inference_ui](img/inferece_ui.png) 173 | 174 | #### Continuously deploying new model versions to the edge 175 | 176 | You can now continuously retrain your model on new data or with new parameter configurations and deploy them onto the edge device by running again through steps 1-5 in [Automated model training in the cloud with Sagemaker Pipelines](#automated-model-training-in-the-cloud-with-sagemaker-pipelines). Your application on the edge device will automatically download the new model packages (if the version provided is higher than the one used currently). It then unloads old model version from the edge agent and loads the newer version once available. It persists its model configuration in the JSON file described in section 5 of [Automated model training in the cloud with Sagemaker Pipelines](#automated-model-training-in-the-cloud-with-sagemaker-pipelines). 177 | 178 | ### Productionizing the solution 179 | 180 | This workshop showcases a simple way of managing deployments of multiple CV models onto an edge device for defect detection use cases. For the sake of simplicity, we run certain steps in a manual fashion by e.g. preparing and deploying models from a Sagemaker Studio notebook. In a production setting, we recommend using dedicated pipelines both for the model building component as well as for the deployment component. 
Similar to the [MLOps reference architecture as outlined in the AWS blog](https://aws.amazon.com/blogs/apn/taming-machine-learning-on-aws-with-mlops-a-reference-architecture/), one would use Amazon EventBridge event rules to kick off the deployment process after approval of a new version in the model registry has been detected. Likewise, the pipeline execution would be triggered by either a commit to a connected code repository or by other events that require retraining (e.g. detected model drift or new incoming data). 181 | 182 | ### Troubleshooting and FAQ 183 | 184 | * *The application running on EC2 is not accessible via its public IP address.* 185 | Make sure you opened up the port your application is running on in the security group attached to the instance. In case you cannot access the application through any other port than port 80, you could try to map port 80 to 8080 by configuring a NAT redirect using the *iptables* command line tool as follows: `sudo iptables -t nat -A PREROUTING -p tcp --dport 80 -j REDIRECT --to-port 8080` 186 | * *The edge application fails due to errors related to SageMaker Edge Manager* 187 | You can try to restart the edge agent by killing the running process and starting the edge agent again with the provided shell script. Make sure that `models_config.json` is configured such that the desired models get loaded automatically upon application start. You can also check out the agent logs under `agent/logs` for troubleshooting. 188 | 189 | ## References 190 | 191 | * aws-samples GitHub Repository "ML@Edge with SageMaker Edge Manager" 192 | https://github.com/aws-samples/amazon-sagemaker-edge-manager-workshop 193 | * Ronneberger, O., Fischer, P., & Brox, T. (2015). U-Net: Convolutional Networks for Biomedical Image Segmentation. MICCAI. https://arxiv.org/abs/1505.04597 194 | * Bozic, J., Tabernik, D. & Skocaj, D. (2021). Mixed supervision for surface-defect detection: from weakly to fully supervised learning. Computers in Industry. https://arxiv.org/abs/2104.06064 195 | 196 | ## Security 197 | 198 | See [CONTRIBUTING](CONTRIBUTING.md) for more information. 199 | 200 | ## License 201 | 202 | This library is licensed under the MIT-0 License. See the [LICENSE](LICENSE) file. 203 | -------------------------------------------------------------------------------- /setup/template.yaml: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | AWSTemplateFormatVersion: '2010-09-09' 4 | Description: "This template will create the necessary IoT resources for the SM Edge Manager Workshop" 5 | Parameters: 6 | ProjectName: 7 | Type: String 8 | Description: A name for this project. This value defines the naming of many of the resources created by this template. 
9 | 10 | Resources: 11 | ProjectArtifactsBucket: 12 | Type: AWS::S3::Bucket 13 | DeletionPolicy: Delete 14 | Properties: 15 | BucketName: !Sub sm-edge-workshop-${ProjectName}-${AWS::AccountId} 16 | BucketEncryption: 17 | ServerSideEncryptionConfiguration: 18 | - ServerSideEncryptionByDefault: 19 | SSEAlgorithm: AES256 20 | VersioningConfiguration: 21 | Status: Enabled 22 | 23 | EdgeDeviceRole: 24 | Type: AWS::IAM::Role 25 | Properties: 26 | RoleName: !Sub EdgeDeviceRole-${ProjectName} 27 | AssumeRolePolicyDocument: 28 | Version: "2012-10-17" 29 | Statement: 30 | - Effect: Allow 31 | Principal: 32 | Service: 33 | - sagemaker.amazonaws.com 34 | - iot.amazonaws.com 35 | - credentials.iot.amazonaws.com 36 | Action: 37 | - 'sts:AssumeRole' 38 | Path: / 39 | Policies: 40 | - PolicyName: !Sub EdgeDeviceRolePolicy-${ProjectName} 41 | PolicyDocument: 42 | Version: "2012-10-17" 43 | Statement: 44 | - Effect: Allow 45 | Action: 46 | - 's3:GetObject' 47 | - 's3:PutObject' 48 | - 's3:ListBucket' 49 | - 's3:GetBucketLocation' 50 | Resource: 51 | - !GetAtt ProjectArtifactsBucket.Arn 52 | - !Join [ '/', [ !GetAtt ProjectArtifactsBucket.Arn, '*' ] ] 53 | - Effect: Allow 54 | Action: 55 | - 's3:ListAllMyBuckets' 56 | Resource: 57 | - '*' 58 | - Effect: Allow 59 | Action: 60 | - 'iot:CreateRoleAlias' 61 | - 'iot:DescribeRoleAlias' 62 | - 'iot:UpdateRoleAlias' 63 | - 'iot:TagResource' 64 | - 'iot:ListTagsForResource' 65 | Resource: 66 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:rolealias/SageMakerEdge*" 67 | - Effect: Allow 68 | Action: 69 | - 'iam:GetRole' 70 | - 'iam:PassRole' 71 | Resource: 72 | - !Sub 'arn:aws:iam::${AWS::AccountId}:role/*SageMaker*' 73 | - !Sub 'arn:aws:iam::${AWS::AccountId}:role/*Sagemaker*' 74 | - !Sub 'arn:aws:iam::${AWS::AccountId}:role/*sagemaker*' 75 | - !Sub 'arn:aws:iam::${AWS::AccountId}:role/EdgeDeviceRole-${ProjectName}' 76 | - Effect: Allow 77 | Action: 78 | - 'sagemaker:GetDeviceRegistration' 79 | - 'sagemaker:SendHeartbeat' 80 | - 'sagemaker:DescribeDevice' 81 | Resource: "*" 82 | - Effect: Allow 83 | Action: 84 | - 'iot:DescribeEndpoint' 85 | Resource: 86 | - '*' 87 | 88 | DefectDetectionIotPolicy: 89 | Type: AWS::IoT::Policy 90 | Properties: 91 | PolicyName: !Sub "defect-detection-policy-${ProjectName}" 92 | PolicyDocument: 93 | Version: '2012-10-17' 94 | Statement: 95 | - Effect: Allow 96 | Action: 97 | - iot:Connect 98 | Resource: 99 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:client/*" 100 | - Effect: Allow 101 | Action: 102 | - iot:Publish 103 | - iot:Receive 104 | Resource: 105 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:topic/defect-detection/*" 106 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:topic/$aws/*" 107 | - Effect: Allow 108 | Action: 109 | - iot:Subscribe 110 | Resource: 111 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:topicfilter/defect-detection/*" 112 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:topicfilter/$aws/*" 113 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:topic/$aws/*" 114 | - Effect: Allow 115 | Action: 116 | - iot:UpdateThingShadow 117 | Resource: 118 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:topicfilter/defect-detection/*" 119 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:thing/edge-device-*" 120 | - Effect: Allow 121 | Action: 122 | - iot:AssumeRoleWithCertificate 123 | Resource: 124 | - !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:rolealias/SageMakerEdge-defect-detection-${ProjectName}" 125 | 126 | EdgeDeviceThing: 127 | Type: 
AWS::IoT::Thing 128 | Properties: 129 | ThingName: !Sub edge-device-${ProjectName} 130 | 131 | EdgeDeviceFleet: 132 | Type: AWS::SageMaker::DeviceFleet 133 | Properties: 134 | Description: Device fleet for the defect detection edge devices 135 | DeviceFleetName: !Sub defect-detection-${ProjectName} 136 | OutputConfig: 137 | S3OutputLocation: !Join [ '/', [ 's3:/', !Ref 'ProjectArtifactsBucket', 'data'] ] 138 | RoleArn: !GetAtt EdgeDeviceRole.Arn 139 | 140 | DefectDetectionModelPackageGroupImgClassification: 141 | Type: AWS::SageMaker::ModelPackageGroup 142 | Properties: 143 | ModelPackageGroupName: !Sub defect-detection-img-classification-${ProjectName} 144 | ModelPackageGroupDescription: A model package group for your image classification models 145 | 146 | DefectDetectionModelPackageGroupSemanticSegmentation: 147 | Type: AWS::SageMaker::ModelPackageGroup 148 | Properties: 149 | ModelPackageGroupName: !Sub defect-detection-semantic-segmentation-${ProjectName} 150 | ModelPackageGroupDescription: A model package group for your semantic segmentation models 151 | 152 | CustomResourceLambdaRole: 153 | Type: AWS::IAM::Role 154 | Properties: 155 | RoleName: !Sub CustomResourceLambdaRole-${ProjectName} 156 | AssumeRolePolicyDocument: 157 | Version: "2012-10-17" 158 | Statement: 159 | - Effect: Allow 160 | Principal: 161 | Service: 162 | - lambda.amazonaws.com 163 | Action: 164 | - 'sts:AssumeRole' 165 | Path: / 166 | Policies: 167 | - PolicyDocument: 168 | Version: "2012-10-17" 169 | Statement: 170 | - Effect: Allow 171 | Action: 172 | - 'iot:*' 173 | - 'sagemaker:*' 174 | Resource: 175 | - '*' 176 | - Effect: Allow 177 | Action: 178 | - 's3:*' 179 | Resource: 180 | - !GetAtt ProjectArtifactsBucket.Arn 181 | - !Join [ '', [ !GetAtt ProjectArtifactsBucket.Arn, '/*'] ] 182 | - 'arn:aws:s3:::sagemaker-edge-release-store-us-west-2-linux-x64/*' 183 | - 'arn:aws:s3:::sagemaker-edge-release-store-us-west-2-linux-x64' 184 | PolicyName: !Sub CustomResourceLambdaPolicy-${ProjectName} 185 | ManagedPolicyArns: 186 | - arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole 187 | 188 | PrepareDevicePackageCustomResourceLambda: 189 | Type: AWS::Lambda::Function 190 | Properties: 191 | Role: !GetAtt CustomResourceLambdaRole.Arn 192 | FunctionName: !Sub PrepareDevicePackage-CfnCustomResource-${ProjectName} 193 | Runtime: python3.8 194 | Handler: index.lambda_handler 195 | Timeout: 15 196 | Code: 197 | ZipFile: | 198 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 199 | # SPDX-License-Identifier: MIT-0 200 | """ 201 | Lambda-backed custom resource function to create the SageMaker Edge Manager device package.
202 | Support SageMaker Edge Agent Version: 203 | """ 204 | import json 205 | import os 206 | import logging 207 | import stat 208 | from botocore.parsers import LOG 209 | import urllib3 210 | import boto3 211 | import tarfile 212 | import io 213 | from botocore.exceptions import ClientError 214 | 215 | http = urllib3.PoolManager() 216 | 217 | LOGGER = logging.getLogger() 218 | LOGGER.setLevel(logging.INFO) 219 | 220 | BUCKET_NAME = os.environ['BUCKET_NAME'] 221 | PROJECT_NAME = os.environ['PROJECT_NAME'] 222 | AWS_REGION = os.environ['AWS_REGION'] 223 | 224 | LOCAL_DIR_PREFIX = '/tmp/' # Needed for running in AWS Lambda 225 | 226 | iot_client = boto3.client('iot') 227 | sm_client = boto3.client('sagemaker') 228 | s3_client = boto3.client('s3') 229 | 230 | # Global variables 231 | # This information needs to match with what was defined in the CloudFormation template 232 | sm_edge_device_name = 'edge-device-defect-detection-%s' % PROJECT_NAME 233 | iot_policy_name = 'defect-detection-policy-%s' % PROJECT_NAME 234 | iot_thing_name = 'edge-device-%s' % PROJECT_NAME 235 | iot_thing_group_name='defect-detection-%s-group' % PROJECT_NAME 236 | sm_em_fleet_name = 'defect-detection-%s' % PROJECT_NAME 237 | role_alias = 'SageMakerEdge-%s' % sm_em_fleet_name 238 | 239 | 240 | def cfn_cleanup(): 241 | """Clean up resources created in the custom resources""" 242 | 243 | LOGGER.info('Deleting role alias if exists') 244 | try: 245 | iot_client.delete_role_alias(roleAlias=role_alias) 246 | except: 247 | LOGGER.info('Role alias deletion failed, continuing anyways') 248 | 249 | LOGGER.info('Deregistering device from edge fleet if exists') 250 | try: 251 | sm_client.deregister_devices( 252 | DeviceFleetName=sm_em_fleet_name, 253 | DeviceNames=[sm_edge_device_name] 254 | ) 255 | except: 256 | LOGGER.info('Device deregistration failed, continuing anyways') 257 | 258 | LOGGER.info('Detaching certificates') 259 | try: 260 | cert_arn = iot_client.list_thing_principals(thingName=iot_thing_name)['principals'][0] 261 | cert_id = cert_arn.split('/')[-1] 262 | iot_client.detach_policy(policyName=iot_policy_name, target=cert_arn) 263 | iot_client.detach_thing_principal(thingName=iot_thing_name, principal=cert_arn) 264 | iot_client.update_certificate(certificateId=cert_id, newStatus='INACTIVE') 265 | iot_client.delete_certificate(certificateId=cert_id, forceDelete=True) 266 | iot_client.delete_thing_group(thingGroupName=iot_thing_group_name) 267 | except: 268 | LOGGER.info('Detaching certificates failed, continuing anyways') 269 | 270 | 271 | 272 | 273 | def lambda_handler(event, context): 274 | '''Handle Lambda event from AWS''' 275 | 276 | try: 277 | LOGGER.info('REQUEST RECEIVED:\n %s', event) 278 | LOGGER.info('REQUEST RECEIVED:\n %s', context) 279 | if event['RequestType'] == 'Create': 280 | LOGGER.info('CREATE!') 281 | 282 | LOGGER.info('Starting device packaging...') 283 | try: 284 | prepare_device_package(event, context) 285 | send_response(event, context, "SUCCESS", 286 | {"Message": "Resource creation successful!"}) 287 | except Exception as e: 288 | send_response(event, context, "FAILED", {"Message": "Resource creation failed during device packaging!", "Error": str(e)}) 289 | elif event['RequestType'] == 'Update': 290 | LOGGER.info('UPDATE!') 291 | send_response(event, context, "SUCCESS", 292 | {"Message": "Resource update successful!"}) 293 | elif event['RequestType'] == 'Delete': 294 | LOGGER.info('DELETE!') 295 | # Start cleanup method 296 | cfn_cleanup() 297 | send_response(event, context, "SUCCESS", 298 
| {"Message": "Resource deletion successful!"}) 299 | else: 300 | LOGGER.info('FAILED!') 301 | send_response(event, context, "FAILED", 302 | {"Message": "Unexpected event received from CloudFormation"}) 303 | except: #pylint: disable=W0702 304 | LOGGER.info('FAILED!') 305 | send_response(event, context, "FAILED", { 306 | "Message": "Exception during processing"}) 307 | 308 | 309 | def send_response(event, context, response_status, response_data): 310 | '''Send a resource manipulation status response to CloudFormation''' 311 | response_body = json.dumps({ 312 | "Status": response_status, 313 | "Reason": "See the details in CloudWatch Log Stream: " + context.log_stream_name, 314 | "PhysicalResourceId": context.log_stream_name, 315 | "StackId": event['StackId'], 316 | "RequestId": event['RequestId'], 317 | "LogicalResourceId": event['LogicalResourceId'], 318 | "Data": response_data 319 | }) 320 | 321 | print("Response body:") 322 | print(response_body) 323 | 324 | response_url = event['ResponseURL'] 325 | 326 | headers = { 327 | 'content-type' : '', 328 | 'content-length' : str(len(response_body)) 329 | } 330 | 331 | try: 332 | response = http.request('PUT', response_url, headers=headers, body=response_body) 333 | print("Status code:", response.status) 334 | 335 | except Exception as e: 336 | 337 | print("send(..) failed executing http.request(..):", e) 338 | 339 | 340 | def setup_agent(thing_group_name, thing_group_arn): 341 | """Creates configuration file and sets up SageMaker Edge Agent for deployment 342 | onto an Amazon S3 bucket. Registers a device with a device fleet, creates IoT 343 | certificates and attaches them to the previously created IoT thing. Saves 344 | certificates onto local disk to make them ready for uploading to S3. 345 | 346 | Args: 347 | thing_group_name (string): a name for the IoT thing group 348 | thing_group_arn (string): the ARN of the IoT thing group 349 | """ 350 | 351 | local_base_path = LOCAL_DIR_PREFIX + "agent/certificates/iot/edge_device_cert_%s.pem" 352 | relative_base_path = "agent/certificates/iot/edge_device_cert_%s.pem" 353 | thing_arn_template = thing_group_arn.replace('thinggroup', 'thing').replace(thing_group_name, '%s') 354 | cred_host = iot_client.describe_endpoint(endpointType='iot:CredentialProvider')['endpointAddress'] 355 | 356 | # Check length of device name string 357 | if len(sm_edge_device_name) > 64: 358 | LOGGER.error("Device name for edge device is too long. Must not exceed 64 characters.") 359 | raise Exception('Device name for edge device is longer than 64 characters. 
Please choose a shorter value for ProjectName.') 360 | 361 | # register the device in the fleet 362 | # the device name needs to have 36 chars 363 | dev = [{'DeviceName': sm_edge_device_name, 'IotThingName': iot_thing_name}] 364 | try: 365 | sm_client.describe_device(DeviceFleetName=sm_em_fleet_name, DeviceName=sm_edge_device_name) 366 | LOGGER.info("Device was already registered on SageMaker Edge Manager") 367 | except ClientError as e: 368 | if e.response['Error']['Code'] != 'ValidationException': raise e 369 | LOGGER.info("Registering a new device %s on fleet %s" % (sm_edge_device_name, sm_em_fleet_name)) 370 | sm_client.register_devices(DeviceFleetName=sm_em_fleet_name, Devices=dev) 371 | iot_client.add_thing_to_thing_group( 372 | thingGroupName=thing_group_name, 373 | thingGroupArn=thing_group_arn, 374 | thingName=iot_thing_name, 375 | thingArn=thing_arn_template % iot_thing_name 376 | ) 377 | 378 | # if you reach this point you need to create new certificates 379 | # generate the certificates 380 | cert = local_base_path % ('cert') 381 | key = local_base_path % ('pub') 382 | pub = local_base_path % ('key') 383 | 384 | # Relative paths needed for setting path in config file 385 | cert_relative = relative_base_path % ('cert') 386 | key_relative = relative_base_path % ('pub') 387 | pub_relative = relative_base_path % ('key') 388 | 389 | cert_meta=iot_client.create_keys_and_certificate(setAsActive=True) 390 | cert_arn = cert_meta['certificateArn'] 391 | with open(cert, 'w') as c: c.write(cert_meta['certificatePem']) 392 | with open(key, 'w') as c: c.write(cert_meta['keyPair']['PrivateKey']) 393 | with open(pub, 'w') as c: c.write(cert_meta['keyPair']['PublicKey']) 394 | 395 | # attach the certificates to the policy and to the thing 396 | iot_client.attach_policy(policyName=iot_policy_name, target=cert_arn) 397 | iot_client.attach_thing_principal(thingName=iot_thing_name, principal=cert_arn) 398 | 399 | LOGGER.info("Creating agent config JSON file") 400 | 401 | # Please note that the $WORKDIR variables need to be replaced by the absolute path of the working directory of your project. 402 | # If you follow the guide, the install script will automatically replace those. 
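# (Illustration only, not executed here and assuming a Linux-based edge device: the placeholder could be
#  replaced manually with something like `sed -i "s|\$WORKDIR|$(pwd)|g" agent/conf/config_edge_device.json`,
#  run from the project working directory; the provided install script performs this substitution for you.)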
403 | agent_params = { 404 | "sagemaker_edge_core_device_name": sm_edge_device_name, 405 | "sagemaker_edge_core_device_fleet_name": sm_em_fleet_name, 406 | "sagemaker_edge_core_region": AWS_REGION, 407 | "sagemaker_edge_provider_provider": "Aws", 408 | "sagemaker_edge_provider_provider_path" : "$WORKDIR/agent/lib/libprovider_aws.so", 409 | "sagemaker_edge_core_root_certs_path": "$WORKDIR/agent/certificates/root", 410 | "sagemaker_edge_provider_aws_ca_cert_file": "$WORKDIR/agent/certificates/iot/AmazonRootCA1.pem", 411 | "sagemaker_edge_provider_aws_cert_file": "$WORKDIR/%s" % cert_relative, 412 | "sagemaker_edge_provider_aws_cert_pk_file": "$WORKDIR/%s" % key_relative, 413 | "sagemaker_edge_provider_aws_iot_cred_endpoint": "https://%s/role-aliases/%s/credentials" % (cred_host,role_alias), 414 | "sagemaker_edge_core_capture_data_destination": "Cloud", 415 | "sagemaker_edge_provider_s3_bucket_name": BUCKET_NAME, 416 | "sagemaker_edge_core_folder_prefix": "edge-agent-inference-data-capture", 417 | "sagemaker_edge_core_capture_data_buffer_size": 30, 418 | "sagemaker_edge_core_capture_data_batch_size": 10, 419 | "sagemaker_edge_core_capture_data_push_period_seconds": 10, 420 | "sagemaker_edge_core_capture_data_base64_embed_limit": 2, 421 | "sagemaker_edge_log_verbose": False 422 | } 423 | with open(LOCAL_DIR_PREFIX + 'agent/conf/config_edge_device.json', 'w') as conf: 424 | conf.write(json.dumps(agent_params, indent=4)) 425 | 426 | 427 | def prepare_device_package(event, context): 428 | """Prepares the edge device package in a lambda function and uploads it to the S3 bucket""" 429 | 430 | # create a new thing group 431 | thing_group_arn = None 432 | agent_pkg_bucket = 'sagemaker-edge-release-store-us-west-2-linux-x64' 433 | agent_config_package_prefix = 'edge-device-configuration/agent/config.tgz' 434 | 435 | # check if edge agent package has already been built 436 | try: 437 | s3_client.download_file(Bucket=BUCKET_NAME, Key=agent_config_package_prefix, Filename='/tmp/dump') 438 | LOGGER.info('The agent configuration package was already built! 
Skipping...') 439 | return 440 | except ClientError as e: 441 | pass 442 | 443 | # Create a new thing group if not found yet 444 | try: 445 | thing_group_arn = iot_client.describe_thing_group(thingGroupName=iot_thing_group_name)['thingGroupArn'] 446 | LOGGER.info("Thing group found") 447 | except iot_client.exceptions.ResourceNotFoundException as e: 448 | LOGGER.info("Creating a new thing group") 449 | thing_group_arn = iot_client.create_thing_group(thingGroupName=iot_thing_group_name)['thingGroupArn'] 450 | 451 | LOGGER.info("Creating the directory structure for the agent") 452 | # create a structure for the agent files 453 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/certificates/root', exist_ok=True) 454 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/certificates/iot', exist_ok=True) 455 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/logs', exist_ok=True) 456 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/model', exist_ok=True) 457 | os.makedirs(LOCAL_DIR_PREFIX + 'agent/conf', exist_ok=True) 458 | 459 | LOGGER.info("Downloading root certificate and agent binary") 460 | # then get some root certificates 461 | resp = http.request('GET', 'https://www.amazontrust.com/repository/AmazonRootCA1.pem') 462 | with open(LOCAL_DIR_PREFIX + 'agent/certificates/iot/AmazonRootCA1.pem', 'w') as c: 463 | c.write(resp.data.decode('utf-8')) 464 | 465 | # this certificate validates the edge manager package 466 | s3_client.download_file( 467 | Bucket=agent_pkg_bucket, 468 | Key='Certificates/%s/%s.pem' % (AWS_REGION, AWS_REGION), 469 | Filename=LOCAL_DIR_PREFIX + 'agent/certificates/root/%s.pem' % AWS_REGION 470 | ) 471 | 472 | LOGGER.info("Adjusting file permissions of pem files") 473 | # adjust the permissions of the files 474 | os.chmod(LOCAL_DIR_PREFIX + 'agent/certificates/iot/AmazonRootCA1.pem', stat.S_IRUSR|stat.S_IRGRP) 475 | os.chmod(LOCAL_DIR_PREFIX + 'agent/certificates/root/%s.pem' % AWS_REGION, stat.S_IRUSR|stat.S_IRGRP) 476 | 477 | LOGGER.info("Processing the agent...") 478 | setup_agent(iot_thing_group_name, thing_group_arn) 479 | 480 | LOGGER.info("Creating the final package...") 481 | with io.BytesIO() as f: 482 | with tarfile.open(fileobj=f, mode='w:gz') as tar: 483 | tar.add(LOCAL_DIR_PREFIX + 'agent', 'agent', recursive=True) 484 | f.seek(0) 485 | LOGGER.info("Uploading to S3") 486 | s3_client.upload_fileobj(f, Bucket=BUCKET_NAME, Key=agent_config_package_prefix) 487 | LOGGER.info("Done!") 488 | Environment: 489 | Variables: 490 | BUCKET_NAME: !Ref ProjectArtifactsBucket 491 | PROJECT_NAME: !Ref ProjectName 492 | 493 | PrepareDevicePackageCR: 494 | Type: Custom::PrepareDevicePackage 495 | Properties: 496 | ServiceToken: !GetAtt PrepareDevicePackageCustomResourceLambda.Arn 497 | 498 | Outputs: 499 | EdgeDeviceIoTThingOutput: 500 | Description: The edge device IoT Thing where SageMaker Edge Manager will be running 501 | Value: !Ref EdgeDeviceThing 502 | EdgeDeviceFleetOutput: 503 | Description: The edge device fleet of SageMaker Edge Manager which contains the IoT things 504 | Value: !Ref EdgeDeviceFleet 505 | EdgeDeviceRoleOutput: 506 | Description: The IAM role which is mapped to the edge device certificate 507 | Value: !Ref EdgeDeviceRole 508 | EdgeDeviceRoleAliasOutput: 509 | Description: The IoT role alias which connects the device certificate to an IAM role 510 | Value: !Sub "arn:aws:iot:${AWS::Region}:${AWS::AccountId}:rolealias/SageMakerEdge-defect-detection-${ProjectName}" 511 | ArtifactsBucketOutput: 512 | Description: The S3 bucket which contains the packaged edge agent configuration files 513 | Value: 
!Ref ProjectArtifactsBucket 514 | --------------------------------------------------------------------------------